diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
index c331451d5..5559db77b 100644
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@@ -47,13 +47,15 @@
package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.*;
-import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
-import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.OutputMode;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.utils.collections.DefaultHashMap;
+import org.broadinstitute.sting.utils.variant.HomoSapiens;
import org.broadinstitute.variant.variantcontext.VariantContext;
import java.io.File;
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
import java.util.Collections;
import java.util.List;
import java.util.Map;
@@ -66,7 +68,7 @@ import java.util.Map;
* This is pulled out so that every caller isn't exposed to the arguments from every other caller.
*/
-public class StandardCallerArgumentCollection {
+public class StandardCallerArgumentCollection implements Cloneable {
/**
* The expected heterozygosity value used to compute prior probability that a locus is non-reference.
*
@@ -96,16 +98,16 @@ public class StandardCallerArgumentCollection {
* which determines how many chromosomes each individual in the species carries.
*/
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept", required = false)
- public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
+ public Double snpHeterozygosity = HomoSapiens.SNP_HETEROZYGOSITY;
/**
* This argument informs the prior probability of having an indel at a site.
*/
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling. See the GATKDocs for heterozygosity for full details on the meaning of this population genetics concept", required = false)
- public double INDEL_HETEROZYGOSITY = 1.0/8000;
+ public double indelHeterozygosity = HomoSapiens.INDEL_HETEROZYGOSITY;
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
- public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
+ public org.broadinstitute.sting.gatk.walkers.genotyper.GenotypingMode genotypingMode = org.broadinstitute.sting.gatk.walkers.genotyper.GenotypingMode.DISCOVERY;
/**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
@@ -209,22 +211,76 @@ public class StandardCallerArgumentCollection {
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
+ /**
+ * Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
+ */
+ @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Ploidy (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
+ public int samplePloidy = HomoSapiens.DEFAULT_PLOIDY;
+
+ @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
+ public OutputMode outputMode = OutputMode.EMIT_VARIANTS_ONLY;
+
+ /**
+ * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
+ * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
+ * WARNINGS:
+ * - This feature will inflate VCF file size considerably.
+ * - All SNP ALT alleles will be emitted with corresponding 10 PL values.
+ * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
+ */
+ @Advanced
+ @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
+ public boolean annotateAllSitesWithPLs = false;
+
+
+ /**
+ * Creates a Standard caller argument collection with default values.
+ */
public StandardCallerArgumentCollection() { }
- // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
- public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) {
- this.alleles = SCAC.alleles;
- this.GenotypingMode = SCAC.GenotypingMode;
- this.heterozygosity = SCAC.heterozygosity;
- this.INDEL_HETEROZYGOSITY = SCAC.INDEL_HETEROZYGOSITY;
- this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
- this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
- this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
- this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
- this.CONTAMINATION_FRACTION_FILE=SCAC.CONTAMINATION_FRACTION_FILE;
- this.exactCallsLog = SCAC.exactCallsLog;
- this.sampleContamination=SCAC.sampleContamination;
- this.AFmodel = SCAC.AFmodel;
- this.inputPrior = SCAC.inputPrior;
+ /**
+ * "Casts" a caller argument collection into another type.
+ *
+ *
Common fields values are copied across
+ * @param clazz the class of the result.
+ * @param result argument collection class.
+ * @return never {@code null}.
+ */
+ public T cloneTo(final Class clazz) {
+ // short cut: just use regular clone if it happens to be the same class.
+ if (clazz == getClass())
+ return (T) clone();
+ try {
+ final T result = clazz.newInstance();
+ for (final Field field : getClass().getFields()) {
+ // just copy common fields.
+ if (!field.getDeclaringClass().isAssignableFrom(clazz))
+ continue;
+ final int fieldModifiers = field.getModifiers();
+ if (Modifier.isPrivate((fieldModifiers)))
+ continue;
+ if (Modifier.isFinal(fieldModifiers))
+ continue;
+ if (Modifier.isStatic(fieldModifiers))
+ continue;
+ field.set(result,field.get(this));
+ }
+ return result;
+ } catch (final Exception ex) {
+ throw new IllegalStateException(ex);
+ }
+ }
+
+ /**
+ * Creates a copy of this configuration.
+ * @return never {@code null}.
+ */
+ @Override
+ public StandardCallerArgumentCollection clone() {
+ try {
+ return (StandardCallerArgumentCollection) super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new IllegalStateException("unreachable code");
+ }
}
}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
index 9ea027698..c3412e2bc 100644
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
@@ -78,6 +78,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.variant.variantcontext.*;
import java.util.*;
@@ -133,8 +134,8 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General
final List alleles = new ArrayList();
if ( allAllelesToUse != null ) {
alleles.addAll(allAllelesToUse);
- } else if ( UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
- final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
+ } else if ( UAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ) {
+ final VariantContext vc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, ref.getLocus(), true, logger, UAC.alleles);
// ignore places where we don't have a SNP
if ( vc == null || !vc.isSNP() )
@@ -150,7 +151,7 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General
if ( alleles.size() == 1 ) {
final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
// if we only want variants, then we don't need to calculate genotype likelihoods
- if ( UAC.OutputMode != UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
+ if ( UAC.outputMode != OutputMode.EMIT_VARIANTS_ONLY )
// otherwise, choose any alternate allele (it doesn't really matter)
alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
index 95d3fb78b..dc5220e6a 100644
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
@@ -66,30 +66,17 @@ import java.util.Map;
/**
* The model representing how we calculate genotype likelihoods
*/
-public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
+public abstract class GenotypeLikelihoodsCalculationModel {
public static final String DUMMY_LANE = "Lane1";
public static final String DUMMY_SAMPLE_NAME = "DummySample1";
- /* public enum Model {
- SNP,
- INDEL,
- BOTH
- }
- */
- public enum Model {
+ public enum Name {
SNP,
INDEL,
GENERALPLOIDYSNP,
GENERALPLOIDYINDEL,
- BOTH
- }
-
- public enum GENOTYPING_MODE {
- /** the Unified Genotyper will choose the most likely alternate allele */
- DISCOVERY,
- /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */
- GENOTYPE_GIVEN_ALLELES
+ BOTH;
}
protected final UnifiedArgumentCollection UAC;
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingEngine.java
new file mode 100644
index 000000000..c0dcfd54e
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingEngine.java
@@ -0,0 +1,637 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
+import org.broadinstitute.variant.variantcontext.*;
+import org.broadinstitute.variant.vcf.VCFConstants;
+
+import java.util.*;
+
+/**
+ * Base class for genotyper engines.
+ *
+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
+ */
+public abstract class GenotypingEngine {
+
+ public static final String LOW_QUAL_FILTER_NAME = "LowQual";
+
+ protected Logger logger;
+
+ protected final GenomeAnalysisEngine toolkit;
+
+ protected final Config configuration;
+
+ protected final VariantAnnotatorEngine annotationEngine;
+
+ protected final int numberOfGenomes;
+
+ protected final Collection sampleNames;
+
+
+ private final double[] log10AlleleFrequencyPriorsSNPs;
+
+ private final double[] log10AlleleFrequencyPriorsIndels;
+
+ private final GenomeLocParser genomeLocParser;
+
+ // the model used for calculating p(non-ref)
+ protected ThreadLocal afcm = new ThreadLocal() {
+
+ @Override
+ public AFCalc initialValue() {
+ return AFCalcFactory.createAFCalc(configuration, numberOfGenomes, logger);
+ }
+ };
+
+ /**
+ * Construct a new genotyper engine.
+ *
+ * @param toolkit reference to the genome-analysis toolkit.
+ * @param configuration engine configuration object.
+ * @param annotationEngine reference to the annotation engine. If {@code null}, no annotations will be processed.
+ * @param sampleNames subset of sample to work on identified by their names. If {@code null}, the full toolkit
+ * sample set will be used instead.
+ *
+ * @throws IllegalArgumentException if either {@code toolkit} or {@code configuration} is {@code null}.
+ */
+ public GenotypingEngine(final GenomeAnalysisEngine toolkit, final Config configuration,
+ final VariantAnnotatorEngine annotationEngine, final Set sampleNames) {
+ if (toolkit == null)
+ throw new IllegalArgumentException("the toolkit cannot be null");
+ if (configuration == null)
+ throw new IllegalArgumentException("the configuration cannot be null");
+ this.configuration = configuration;
+ this.annotationEngine = annotationEngine;
+ logger = Logger.getLogger(getClass());
+ this.toolkit = toolkit;
+ this.sampleNames = sampleNames != null ? sampleNames : toolkit.getSampleDB().getSampleNames();
+ numberOfGenomes = this.sampleNames.size() * configuration.samplePloidy;
+ log10AlleleFrequencyPriorsSNPs = computeAlleleFrequencyPriors(numberOfGenomes,
+ configuration.snpHeterozygosity,configuration.inputPrior);
+ log10AlleleFrequencyPriorsIndels = computeAlleleFrequencyPriors(numberOfGenomes,
+ configuration.indelHeterozygosity,configuration.inputPrior);
+ genomeLocParser = toolkit.getGenomeLocParser();
+ }
+
+ /**
+ * Changes the logger for this genotyper engine.
+ * @param logger new logger.
+ *
+ * @throws IllegalArgumentException if {@code logger} is {@code null}.
+ */
+ public void setLogger(final Logger logger) {
+ if (logger == null)
+ throw new IllegalArgumentException("the logger cannot be null");
+ this.logger = logger;
+ }
+
+ /**
+ * Returns a reference to the engine configuration
+ * @return never {@code null}.
+ */
+ public Config getConfiguration() {
+ return configuration;
+ }
+
+ /**
+ * Completes a variant context with genotype calls and associated annotations give the genotype likelihoods and
+ * the model that need to be applied.
+ * @param vc variant-context to complete.
+ * @param modelName model name.
+ *
+ * @return can be {@code null} indicating that genotyping it not possible with the information provided.
+ */
+ public VariantCallContext calculateGenotypes(final VariantContext vc, final GeneralPloidySNPGenotypeLikelihoodsCalculationModel.Name modelName) {
+ if (vc == null)
+ throw new IllegalArgumentException("vc cannot be null");
+ if (modelName == null)
+ throw new IllegalArgumentException("the model cannot be null");
+ return calculateGenotypes(null,null,null,null,vc,modelName,false,null);
+ }
+
+ /**
+ * Main entry function to calculate genotypes of a given VC with corresponding GL's that is shared across genotypers (namely UG and HC).
+ *
+ * @param tracker Tracker
+ * @param refContext Reference context
+ * @param rawContext Raw context
+ * @param stratifiedContexts Stratified alignment contexts
+ * @param vc Input VC
+ * @param model GL calculation model
+ * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc
+ * @return VC with assigned genotypes
+ */
+ protected VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext,
+ final AlignmentContext rawContext, Map stratifiedContexts,
+ final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Name model,
+ final boolean inheritAttributesFromInputVC,
+ final Map perReadAlleleLikelihoodMap) {
+
+ final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
+ // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call
+ if (hasTooManyAlternativeAlleles(vc) || vc.getNSamples() == 0)
+ return emptyCallContext(tracker,refContext,rawContext);
+
+ final AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model));
+
+ final OutputAlleleSubset outputAlternativeAlleles = calculateOutputAlleleSubset(AFresult);
+
+ final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0());
+
+ // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
+ double log10Confidence =
+ ! outputAlternativeAlleles.siteIsMonomorphic ||
+ configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES || configuration.annotateAllSitesWithPLs
+ ? AFresult.getLog10PosteriorOfAFEq0() + 0.0
+ : AFresult.getLog10PosteriorOfAFGT0() + 0.0;
+
+
+ // Add 0.0 removes -0.0 occurrences.
+ final double phredScaledConfidence = (-10.0 * log10Confidence) + 0.0;
+
+ // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero
+ if ( !passesEmitThreshold(phredScaledConfidence, outputAlternativeAlleles.siteIsMonomorphic) && !forceSiteEmission())
+ // technically, at this point our confidence in a reference call isn't accurately estimated
+ // because it didn't take into account samples with no data, so let's get a better estimate
+ return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getModelTheta(model), true, PoFGT0);
+
+ // start constructing the resulting VC
+ final GenomeLoc loc = genomeLocParser.createGenomeLoc(vc);
+ final List outputAlleles = outputAlternativeAlleles.outputAlleles(vc.getReference());
+ final VariantContextBuilder builder = new VariantContextBuilder(callSourceString(), loc.getContig(), loc.getStart(), loc.getStop(), outputAlleles);
+
+ // Seems that when log10PError is 0.0, you must pass -0.0 to get a nice output at the other end otherwise is a "-0".
+ // Truth is that this should be fixed in the "variant" dependency code but perhaps it can be ammeneded also in the VariantContextWriter.
+ //TODO report this bug or make sure our VCFWriters do not output '-0' QUAL column values.
+ builder.log10PError(log10Confidence == 0.0 ? -0.0 : log10Confidence);
+ if ( ! passesCallThreshold(phredScaledConfidence) )
+ builder.filter(LOW_QUAL_FILTER_NAME);
+
+ // create the genotypes
+ final GenotypesContext genotypes = afcm.get().subsetAlleles(vc, outputAlleles, true,configuration.samplePloidy);
+ builder.genotypes(genotypes);
+
+ // *** note that calculating strand bias involves overwriting data structures, so we do that last
+ final Map attributes = composeCallAttributes(inheritAttributesFromInputVC, vc, rawContext, stratifiedContexts, tracker, refContext,
+ outputAlternativeAlleles.alternativeAlleleMLECounts(), outputAlternativeAlleles.siteIsMonomorphic, AFresult, outputAlternativeAlleles.outputAlleles(vc.getReference()),genotypes,model,perReadAlleleLikelihoodMap);
+
+ builder.attributes(attributes);
+
+ VariantContext vcCall = builder.make();
+
+ if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine
+ // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
+ final ReadBackedPileup pileup = rawContext.getBasePileup();
+ stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup);
+
+ vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap);
+ }
+
+ // if we are subsetting alleles (either because there were too many or because some were not polymorphic)
+ // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end).
+ if ( outputAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync
+ vcCall = GATKVariantContextUtils.reverseTrimAlleles(vcCall);
+
+ return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0));
+ }
+
+ /**
+ * What string to use as source of variant-context generated by this genotyper-engine.
+ * @return never {@code null} nor empty.
+ */
+ protected abstract String callSourceString();
+
+ /**
+ * Holds information about the alternative allele subsetting based on supporting evidence, genotyping and
+ * output modes.
+ */
+ private static class OutputAlleleSubset {
+ private final Allele[] alleles;
+ private final boolean siteIsMonomorphic;
+ private final int[] mleCounts;
+ private final int count;
+
+ @Requires({"count >= 0", "alleles != null", "mleCounts != null", "count <= alleles.length", " count <= mleCounts"})
+ private OutputAlleleSubset(final int count, final Allele[] alleles, final int[] mleCounts, final boolean siteIsMonomorphic) {
+ this.siteIsMonomorphic = siteIsMonomorphic;
+ this.count = count;
+ this.alleles = alleles;
+ this.mleCounts = mleCounts;
+ }
+
+ private List outputAlleles(final Allele referenceAllele) {
+ final ArrayList result = new ArrayList<>(count + 1);
+ result.add(referenceAllele);
+ for (int i = 0; i < count; i++)
+ result.add(alleles[i]);
+ return result;
+ }
+
+ public List alternativeAlleleMLECounts() {
+ final List result = new ArrayList<>(count);
+ for (int i = 0; i < count; i++)
+ result.add(mleCounts[i]);
+ return result;
+ }
+ }
+
+
+ /**
+ * Provided the exact mode computations it returns the appropiate subset of alleles that progress to genotyping.
+ * @param afcr the exact model calcualtion result.
+ * @return never {@code null}.
+ */
+ private OutputAlleleSubset calculateOutputAlleleSubset(final AFCalcResult afcr) {
+ final List alleles = afcr.getAllelesUsedInGenotyping();
+
+ final int alternativeAlleleCount = alleles.size() - 1;
+ Allele[] outputAlleles = new Allele[alternativeAlleleCount];
+ int[] mleCounts = new int[alternativeAlleleCount];
+ int outputAlleleCount = 0;
+ boolean siteIsMonomorphic = true;
+ for (final Allele alternativeAllele : alleles) {
+ if (alternativeAllele.isReference()) continue;
+ final boolean isPlausible = afcr.isPolymorphicPhredScaledQual(alternativeAllele, configuration.STANDARD_CONFIDENCE_FOR_EMITTING);
+ final boolean toOutput = isPlausible || forceKeepAllele(alternativeAllele);
+
+ siteIsMonomorphic &= ! isPlausible;
+ if (!toOutput) continue;
+ outputAlleles[outputAlleleCount] = alternativeAllele;
+ mleCounts[outputAlleleCount++] = afcr.getAlleleCountAtMLE(alternativeAllele);
+ }
+
+ return new OutputAlleleSubset(outputAlleleCount,outputAlleles,mleCounts,siteIsMonomorphic);
+ }
+
+ /**
+ * Checks whether even if the allele is not well supported by the data, we should keep it for genotyping.
+ *
+ * @param allele target allele.
+ *
+ * @return {@code true} iff we need to keep this alleles even if does not seem plausible.
+ */
+ protected abstract boolean forceKeepAllele(final Allele allele);
+
+ /**
+ * Checks whether a variant site seems confidently called base on user threshold that the score provided
+ * by the exact model.
+ *
+ * @param conf
+ * @param PofF
+ * @return {@true} iff the variant is confidently called.
+ */
+ protected final boolean confidentlyCalled(final double conf, final double PofF) {
+ return conf >= configuration.STANDARD_CONFIDENCE_FOR_CALLING ||
+ (configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES
+ && QualityUtils.phredScaleErrorRate(PofF) >= configuration.STANDARD_CONFIDENCE_FOR_CALLING);
+ }
+
+ /**
+ * Based in the model used, returns the appropriate heterozygosity argument value.
+ * @param model genotyping model.
+ *
+ * @return a valid heterozygosity in (0,1).
+ */
+ private double getModelTheta(final GenotypeLikelihoodsCalculationModel.Name model) {
+ switch (model) {
+ case SNP:
+ case GENERALPLOIDYSNP:
+ return configuration.snpHeterozygosity;
+ case INDEL:
+ case GENERALPLOIDYINDEL:
+ return configuration.indelHeterozygosity;
+ default:
+ throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
+ }
+ }
+
+
+ /**
+ * Checks whether the variant context has too many alternative alleles for progress to genotyping the site.
+ *
+ * AF calculation may get intro trouble with too many alternative alleles.
+ *
+ *
+ * @param vc the variant context to evaluate.
+ *
+ * @throws NullPointerException if {@code vc} is {@code null}.
+ *
+ * @return {@code true} iff there is too many alternative alleles based on
+ * {@link GenotypeLikelihoods#MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED}.
+ */
+ @Requires("vc != null")
+ protected final boolean hasTooManyAlternativeAlleles(final VariantContext vc) {
+ // protect against too many alternate alleles that we can't even run AF on:
+ if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED)
+ return false;
+ logger.warn("Attempting to genotype more than "+GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
+ " alleles. Site will be skipped at location "+vc.getChr()+":"+vc.getStart());
+ return true;
+ }
+
+ /**
+ * Produces an empty variant-call context to output when there is no enough data provided to call anything.
+ *
+ * @param tracker meta-data tracker.
+ * @param ref the reference context.
+ * @param rawContext the read alignment at that location.
+ * @return it might be null if no enough information is provided to do even an empty call. For example when
+ * we have limmited-context (i.e. any of the tracker, reference or alignment is {@code null}.
+ */
+ protected final VariantCallContext emptyCallContext(final RefMetaDataTracker tracker, final ReferenceContext ref,
+ final AlignmentContext rawContext) {
+ if (tracker == null || ref == null || rawContext == null)
+ return null;
+
+ if (!forceSiteEmission())
+ return null;
+
+ VariantContext vc;
+
+ if ( configuration.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES ) {
+ final VariantContext ggaVc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker,
+ rawContext.getLocation(), false, logger, configuration.alleles);
+ if (ggaVc == null)
+ return null;
+ vc = new VariantContextBuilder(callSourceString(), ref.getLocus().getContig(), ggaVc.getStart(),
+ ggaVc.getEnd(), ggaVc.getAlleles()).make();
+ } else {
+ // deal with bad/non-standard reference bases
+ if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
+ return null;
+ final Set alleles = new HashSet<>(Collections.singleton(Allele.create(ref.getBase(),true)));
+ vc = new VariantContextBuilder(callSourceString(), ref.getLocus().getContig(),
+ ref.getLocus().getStart(), ref.getLocus().getStart(), alleles).make();
+ }
+
+ if ( vc != null && annotationEngine != null ) {
+ // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations
+ final ReadBackedPileup pileup = rawContext.getBasePileup();
+ vc = annotationEngine.annotateContext(tracker, ref, AlignmentContextUtils.splitContextBySampleName(pileup), vc);
+ }
+
+ return new VariantCallContext(vc, false);
+ }
+
+ /**
+ * Indicates whether we have to emit any site no matter what.
+ *
+ * Note: this has been added to allow differences between UG and HC GGA modes where the latter force emmitions of all given alleles
+ * sites even if there is no enough confidence.
+ *
+ *
+ * @return {@code true} iff we force emissions.
+ */
+ protected abstract boolean forceSiteEmission();
+
+ protected final VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) {
+ if ( contexts == null )
+ return null;
+
+ double log10POfRef = Math.log10(initialPofRef);
+
+ // for each sample that we haven't examined yet
+ for ( String sample : sampleNames ) {
+ final AlignmentContext context = contexts.get(sample);
+ if ( ignoreCoveredSamples && context != null )
+ continue;
+ final int depth = context == null ? 0 : context.getBasePileup().depthOfCoverage();
+ log10POfRef += estimateLog10ReferenceConfidenceForOneSample(depth, theta);
+ }
+
+ return new VariantCallContext(vc, QualityUtils.phredScaleLog10CorrectRate(log10POfRef) >= configuration.STANDARD_CONFIDENCE_FOR_CALLING, false);
+ }
+
+ protected final double[] getAlleleFrequencyPriors( final GenotypeLikelihoodsCalculationModel.Name model ) {
+ switch (model) {
+ case SNP:
+ case GENERALPLOIDYSNP:
+ return log10AlleleFrequencyPriorsSNPs;
+ case INDEL:
+ case GENERALPLOIDYINDEL:
+ return log10AlleleFrequencyPriorsIndels;
+ default:
+ throw new IllegalArgumentException("Unexpected GenotypeCalculationModel " + model);
+ }
+ }
+
+ /**
+ * Compute the log10 probability of a sample with sequencing depth and no alt allele is actually truly homozygous reference
+ *
+ * Assumes the sample is diploid
+ *
+ * @param depth the depth of the sample
+ * @param theta the heterozygosity of this species (between 0 and 1)
+ * @return a valid log10 probability of the sample being hom-ref
+ */
+ @Requires({"depth >= 0", "theta >= 0.0 && theta <= 1.0"})
+ @Ensures("MathUtils.goodLog10Probability(result)")
+ protected final double estimateLog10ReferenceConfidenceForOneSample(final int depth, final double theta) {
+ final double log10PofNonRef = Math.log10(theta / 2.0) + getRefBinomialProbLog10(depth);
+ return MathUtils.log10OneMinusX(Math.pow(10.0, log10PofNonRef));
+ }
+
+ protected final double getRefBinomialProbLog10(final int depth) {
+ return MathUtils.log10BinomialProbability(depth, 0);
+ }
+
+ /**
+ * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used,
+ * where Pr(AC=i) = theta/i where theta is heterozygosity
+ * @param N Number of chromosomes
+ * @param heterozygosity default heterozygosity to use, if inputPriors is empty
+ * @param inputPriors Input priors to use (in which case heterozygosity is ignored)
+ *
+ * @return never {@code null}.
+ */
+ public static double[] computeAlleleFrequencyPriors(final int N, final double heterozygosity, final List inputPriors) {
+
+ final double[] priors = new double[N + 1];
+ double sum = 0.0;
+
+ if (!inputPriors.isEmpty()) {
+ // user-specified priors
+ if (inputPriors.size() != N)
+ throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
+
+ int idx = 1;
+ for (final double prior: inputPriors) {
+ if (prior < 0.0)
+ throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
+ priors[idx++] = Math.log10(prior);
+ sum += prior;
+ }
+ }
+ else
+ // for each i
+ for (int i = 1; i <= N; i++) {
+ final double value = heterozygosity / (double)i;
+ priors[i] = Math.log10(value);
+ sum += value;
+ }
+
+ // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
+ if (sum > 1.0)
+ throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
+ // null frequency for AF=0 is (1 - sum(all other frequencies))
+ priors[0] = Math.log10(1.0 - sum);
+ return priors;
+ }
+
+ /**
+ * Function that fills vector with allele frequency priors. By default, infinite-sites, neutral variation prior is used,
+ * where Pr(AC=i) = theta/i where theta is heterozygosity
+ * @param N Number of chromosomes
+ * @param priors (output) array to be filled with priors
+ * @param heterozygosity default heterozygosity to use, if inputPriors is empty
+ * @param inputPriors Input priors to use (in which case heterozygosity is ignored)
+ */
+ public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double heterozygosity, final List inputPriors) {
+
+
+ double sum = 0.0;
+
+ if (!inputPriors.isEmpty()) {
+ // user-specified priors
+ if (inputPriors.size() != N)
+ throw new UserException.BadArgumentValue("inputPrior","Invalid length of inputPrior vector: vector length must be equal to # samples +1 ");
+
+ int idx = 1;
+ for (final double prior: inputPriors) {
+ if (prior < 0.0)
+ throw new UserException.BadArgumentValue("Bad argument: negative values not allowed","inputPrior");
+ priors[idx++] = Math.log10(prior);
+ sum += prior;
+ }
+ }
+ else {
+ // for each i
+ for (int i = 1; i <= N; i++) {
+ final double value = heterozygosity / (double)i;
+ priors[i] = Math.log10(value);
+ sum += value;
+ }
+ }
+
+ // protection against the case of heterozygosity too high or an excessive number of samples (which break population genetics assumptions)
+ if (sum > 1.0) {
+ throw new UserException.BadArgumentValue("heterozygosity","The heterozygosity value is set too high relative to the number of samples to be processed, or invalid values specified if input priors were provided - try reducing heterozygosity value or correct input priors.");
+ }
+ // null frequency for AF=0 is (1 - sum(all other frequencies))
+ priors[0] = Math.log10(1.0 - sum);
+ }
+
+
+ protected final boolean passesEmitThreshold(double conf, boolean bestGuessIsRef) {
+ return (configuration.outputMode == OutputMode.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) &&
+ conf >= Math.min(configuration.STANDARD_CONFIDENCE_FOR_CALLING,
+ configuration.STANDARD_CONFIDENCE_FOR_EMITTING);
+ }
+
+ protected final boolean passesCallThreshold(double conf) {
+ return conf >= configuration.STANDARD_CONFIDENCE_FOR_CALLING;
+ }
+
+ protected Map composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc,
+ final AlignmentContext rawContext, final Map stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List alleleCountsofMLE, final boolean bestGuessIsRef,
+ final AFCalcResult AFresult, final List allAllelesToUse, final GenotypesContext genotypes,
+ final GenotypeLikelihoodsCalculationModel.Name model, final Map perReadAlleleLikelihoodMap) {
+ final HashMap attributes = new HashMap<>();
+
+
+ final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null;
+
+
+ // inherit attributed from input vc if requested
+ if (inheritAttributesFromInputVC)
+ attributes.putAll(vc.getAttributes());
+ // if the site was downsampled, record that fact
+ if ( !limitedContext && rawContext.hasPileupBeenDownsampled() )
+ attributes.put(VCFConstants.DOWNSAMPLED_KEY, true);
+
+
+ // add the MLE AC and AF annotations
+ if ( alleleCountsofMLE.size() > 0 ) {
+ attributes.put(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCountsofMLE);
+ int AN = 0;
+ for (final Genotype g : genotypes) {
+ for (final Allele a : g.getAlleles())
+ if (!a.isNoCall()) AN++;
+ }
+ final ArrayList MLEfrequencies = new ArrayList(alleleCountsofMLE.size());
+ // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1)
+ for ( int AC : alleleCountsofMLE )
+ MLEfrequencies.add(Math.min(1.0, (double)AC / (double)AN));
+ attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies);
+ }
+
+ return attributes;
+ }
+
+}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingMode.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingMode.java
new file mode 100644
index 000000000..f6fc87c5a
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypingMode.java
@@ -0,0 +1,68 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+/**
+ * Enumeration of possible genotyping modes.
+ *
+ *
+ * A genotyping mode represents the general approach taken when detecting and calculate variant genotypes.
+ *
+ *
+ * @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
+ */
+public enum GenotypingMode {
+
+ /**
+ * The genotyper will choose the most likely alternate allele
+ */
+ DISCOVERY,
+
+ /**
+ * Only the alleles passed by the user should be considered.
+ */
+ GENOTYPE_GIVEN_ALLELES
+}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
index ae2ea427b..4a0bf371e 100644
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java
@@ -215,7 +215,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
final boolean ignoreSNPAllelesWhenGenotypingIndels) {
List alleles = new ArrayList();
- if (UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
+ if (UAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES) {
VariantContext vc = null;
for (final VariantContext vc_input : tracker.getValues(UAC.alleles, ref.getLocus())) {
if (vc_input != null &&
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/OutputMode.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/OutputMode.java
new file mode 100644
index 000000000..72918e914
--- /dev/null
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/OutputMode.java
@@ -0,0 +1,62 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012 Broad Institute, Inc.
+* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 4. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 5. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 6. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 7. MISCELLANEOUS
+* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+package org.broadinstitute.sting.gatk.walkers.genotyper;
+
+/**
+* TODO document this.
+*
+* @author Valentin Ruano-Rubio <valentin@broadinstitute.org>
+*/
+public enum OutputMode {
+ /** produces calls only at variant sites */
+ EMIT_VARIANTS_ONLY,
+ /** produces calls at variant sites and confident reference sites */
+ EMIT_ALL_CONFIDENT_SITES,
+ /** produces calls at any callable site regardless of confidence; this argument is intended only for point
+ * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by
+ * no means produce a comprehensive set of indels in DISCOVERY mode */
+ EMIT_ALL_SITES
+}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
index f94baf09f..5ec3ac494 100644
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java
@@ -59,6 +59,7 @@ import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.genotyper.DiploidGenotype;
+import org.broadinstitute.sting.utils.gga.GenotypingGivenAllelesUtils;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
@@ -76,7 +77,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
super(UAC, logger);
- useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
+ useAlleleFromVCF = UAC.genotypingMode == GenotypingMode.GENOTYPE_GIVEN_ALLELES;
perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
}
@@ -127,7 +128,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
if ( allAllelesToUse != null ) {
alleles.addAll(allAllelesToUse.subList(1,allAllelesToUse.size())); // this includes ref allele
} else if ( useAlleleFromVCF ) {
- final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);
+ final VariantContext vc = GenotypingGivenAllelesUtils.composeGivenAllelesVariantContextFromRod(tracker, ref.getLocus(), true, logger, UAC.alleles);
// ignore places where we don't have a SNP
if ( vc == null || !vc.isSNP() )
@@ -145,7 +146,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
// if there are no non-ref alleles...
if ( alleles.size() == 1 ) {
// if we only want variants, then we don't need to calculate genotype likelihoods
- if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
+ if ( UAC.outputMode == OutputMode.EMIT_VARIANTS_ONLY )
return builder.make();
else
@@ -193,7 +194,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC
gb.PL(genotypeLikelihoods);
gb.DP(sampleData.depth);
if (UAC.annotateAllSitesWithPLs)
- gb.attribute(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true)));
+ gb.attribute(UnifiedGenotypingEngine.PL_FOR_ALL_SNP_ALLELES_KEY,GenotypeLikelihoods.fromLog10Likelihoods(MathUtils.normalizeFromLog10(allLikelihoods, false, true)));
genotypes.add(gb.make());
}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index 04c5587c3..64960200a 100644
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -49,16 +49,12 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
-import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.VariantContext;
-import java.util.Collections;
-import java.util.List;
-
public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
- public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
+ public GenotypeLikelihoodsCalculationModel.Name GLmodel = GenotypeLikelihoodsCalculationModel.Name.SNP;
/**
* The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
@@ -102,18 +98,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
- /**
- * Advanced, experimental argument: if SNP likelihood model is specified, and if EMIT_ALL_SITES output mode is set, when we set this argument then we will also emit PLs at all sites.
- * This will give a measure of reference confidence and a measure of which alt alleles are more plausible (if any).
- * WARNINGS:
- * - This feature will inflate VCF file size considerably.
- * - All SNP ALT alleles will be emitted with corresponding 10 PL values.
- * - An error will be emitted if EMIT_ALL_SITES is not set, or if anything other than diploid SNP model is used
- */
- @Advanced
- @Argument(fullName = "allSitePLs", shortName = "allSitePLs", doc = "Annotate all sites with PLs", required = false)
- public boolean annotateAllSitesWithPLs = false;
-
// indel-related arguments
/**
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
@@ -184,13 +168,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="refsample", fullName="reference_sample_name", doc="Reference sample name.", required=false)
String referenceSampleName;
- /*
- Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy
- */
- @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false)
- public int samplePloidy = GATKVariantContextUtils.DEFAULT_PLOIDY;
-
-
/**
* The following argument are for debug-only tweaks when running generalized ploidy with a reference sample
*/
@@ -218,9 +195,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
@Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false)
boolean EXCLUDE_FILTERED_REFERENCE_SITES = false;
- @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
- public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
-
/**
* Create a new UAC with defaults for all UAC arguments
*/
@@ -228,51 +202,8 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection
super();
}
- /**
- * Create a new UAC based on the information only our in super-class scac and defaults for all UAC arguments
- * @param scac
- */
- public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) {
- super(scac);
- }
-
- /**
- * Create a new UAC with all parameters having the values in uac
- *
- * @param uac
- */
- public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) {
- // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
- super(uac);
-
- this.GLmodel = uac.GLmodel;
- this.AFmodel = uac.AFmodel;
- this.PCR_error = uac.PCR_error;
- this.COMPUTE_SLOD = uac.COMPUTE_SLOD;
- this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED;
- this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE;
- this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION;
- this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING;
- this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE;
- this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY;
- this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY;
- this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO;
- this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE;
- this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL;
- this.referenceSampleRod = uac.referenceSampleRod;
- this.referenceSampleName = uac.referenceSampleName;
- this.samplePloidy = uac.samplePloidy;
- this.maxQualityScore = uac.minQualityScore;
- this.phredScaledPrior = uac.phredScaledPrior;
- this.minPower = uac.minPower;
- this.minReferenceDepth = uac.minReferenceDepth;
- this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES;
- this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO;
- this.pairHMM = uac.pairHMM;
- this.OutputMode = uac.OutputMode;
-
- this.annotateAllSitesWithPLs = uac.annotateAllSitesWithPLs;
- // todo- arguments to remove
- this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES;
+ @Override
+ public UnifiedArgumentCollection clone() {
+ return (UnifiedArgumentCollection) super.clone();
}
}
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index 17d0217f0..227fd3f20 100644
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -216,7 +216,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif
protected String[] annotationClassesToUse = { "Standard" };
// the calculation arguments
- private UnifiedGenotyperEngine UG_engine = null;
+ private UnifiedGenotypingEngine genotypingEngine = null;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
@@ -273,9 +273,9 @@ public class UnifiedGenotyper extends LocusWalker, Unif
throw new UserException.BadArgumentValue("max_alternate_alleles", "the maximum possible value is " + GenotypeLikelihoods.MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
// warn the user for misusing EMIT_ALL_SITES
- if ( UAC.OutputMode == UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES &&
- UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY &&
- UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.SNP )
+ if ( UAC.outputMode == OutputMode.EMIT_ALL_SITES &&
+ UAC.genotypingMode == GenotypingMode.DISCOVERY &&
+ UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Name.SNP )
logger.warn("WARNING: note that the EMIT_ALL_SITES option is intended only for point mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce a comprehensive set of indels in DISCOVERY mode");
// initialize the verbose writer
@@ -283,7 +283,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif
verboseWriter.println("AFINFO\tLOC\tREF\tALT\tMAF\tF\tAFprior\tMLE\tMAP");
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
- UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, verboseWriter, annotationEngine, samples, UAC.samplePloidy);
+ genotypingEngine = new UnifiedGenotypingEngine(getToolkit(), UAC, annotationEngine,samples, verboseWriter);
// initialize the header
Set headerInfo = getHeaderInfo(UAC, annotationEngine, dbsnp);
@@ -318,7 +318,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, VCFConstants.STRAND_BIAS_KEY);
if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED )
- headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotyperEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site"));
+ headerInfo.add(new VCFInfoHeaderLine(UnifiedGenotypingEngine.NUMBER_OF_DISCOVERED_ALLELES_KEY, 1, VCFHeaderLineType.Integer, "Number of alternate alleles discovered (but not necessarily genotyped) at this site"));
// add the pool values for each genotype
if (UAC.samplePloidy != GATKVariantContextUtils.DEFAULT_PLOIDY) {
@@ -330,7 +330,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif
}
if (UAC.annotateAllSitesWithPLs) {
- headerInfo.add(new VCFFormatHeaderLine(UnifiedGenotyperEngine.PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. Ordering is always PL for AA AC CC GA GC GG TA TC TG TT."));
+ headerInfo.add(new VCFFormatHeaderLine(UnifiedGenotypingEngine.PL_FOR_ALL_SNP_ALLELES_KEY, 10, VCFHeaderLineType.Integer, "Phred-scaled genotype likelihoods for all 4 possible bases regardless of whether there is statistical evidence for them. Ordering is always PL for AA AC CC GA GC GG TA TC TG TT."));
}
VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
VCFConstants.DOWNSAMPLED_KEY,
@@ -350,7 +350,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif
// FILTER fields are added unconditionally as it's not always 100% certain the circumstances
// where the filters are used. For example, in emitting all sites the lowQual field is used
- headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotyperEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
+ headerInfo.add(new VCFFilterHeaderLine(UnifiedGenotypingEngine.LOW_QUAL_FILTER_NAME, "Low quality"));
return headerInfo;
}
@@ -364,7 +364,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif
* @return the VariantCallContext object
*/
public List map(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) {
- return UG_engine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext, samples);
+ return genotypingEngine.calculateLikelihoodsAndGenotypes(tracker, refContext, rawContext);
}
public UGStatistics reduceInit() { return new UGStatistics(); }
diff --git a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
deleted file mode 100644
index c5070a76f..000000000
--- a/protected/gatk-protected/src/main/java/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ /dev/null
@@ -1,844 +0,0 @@
-/*
-* By downloading the PROGRAM you agree to the following terms of use:
-*
-* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
-*
-* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
-*
-* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
-* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
-* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
-*
-* 1. DEFINITIONS
-* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
-*
-* 2. LICENSE
-* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
-* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
-* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
-* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
-*
-* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
-* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
-* Copyright 2012 Broad Institute, Inc.
-* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
-* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
-*
-* 4. INDEMNIFICATION
-* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
-*
-* 5. NO REPRESENTATIONS OR WARRANTIES
-* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
-* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
-*
-* 6. ASSIGNMENT
-* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
-*
-* 7. MISCELLANEOUS
-* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
-* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
-* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
-* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
-* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
-* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
-* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
-*/
-
-package org.broadinstitute.sting.gatk.walkers.genotyper;
-
-import com.google.java.contract.Ensures;
-import com.google.java.contract.Requires;
-import org.apache.log4j.Logger;
-import org.broadinstitute.sting.commandline.RodBinding;
-import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
-import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc;
-import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
-import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult;
-import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.classloader.PluginManager;
-import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
-import org.broadinstitute.sting.utils.BaseUtils;
-import org.broadinstitute.variant.vcf.VCFConstants;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.pileup.PileupElement;
-import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
-import org.broadinstitute.variant.variantcontext.*;
-
-import java.io.PrintStream;
-import java.lang.reflect.Constructor;
-import java.util.*;
-
-public class UnifiedGenotyperEngine {
- public static final String LOW_QUAL_FILTER_NAME = "LowQual";
- private static final String GPSTRING = "GENERALPLOIDY";
-
- public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA";
- public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
-
- public static final double HUMAN_SNP_HETEROZYGOSITY = 1e-3;
- public static final double HUMAN_INDEL_HETEROZYGOSITY = 1e-4;
-
- private static final int SNP_MODEL = 0;
- private static final int INDEL_MODEL = 1;
-
- public enum OUTPUT_MODE {
- /** produces calls only at variant sites */
- EMIT_VARIANTS_ONLY,
- /** produces calls at variant sites and confident reference sites */
- EMIT_ALL_CONFIDENT_SITES,
- /** produces calls at any callable site regardless of confidence; this argument is intended only for point
- * mutations (SNPs) in DISCOVERY mode or generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by
- * no means produce a comprehensive set of indels in DISCOVERY mode */
- EMIT_ALL_SITES
- }
-
- // the unified argument collection
- private final UnifiedArgumentCollection UAC;
- public UnifiedArgumentCollection getUAC() { return UAC; }
-
- // the annotation engine
- private final VariantAnnotatorEngine annotationEngine;
-
- // the model used for calculating genotypes
- private ThreadLocal