diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationResult.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationResult.java index 8f38456d2..b1286480c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationResult.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculationResult.java @@ -102,7 +102,7 @@ public class AFCalculationResult { final Map log10pRefByAllele) { if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping); if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null"); - if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size()); + if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != number of alternate alleles used in genotyping " + (allelesUsedInGenotyping.size() - 1)); if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations); if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2"); if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2"); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorImplementation.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorImplementation.java index 9058edce4..f4c188374 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorImplementation.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/AFCalculatorImplementation.java @@ -65,7 +65,7 @@ import java.util.Map; */ public enum AFCalculatorImplementation { - /** default implementation */ + /** Fast implementation for multi-allelics (equivalent to {@link #EXACT_REFERENCE} for biallelics sites */ EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalculator.class, 2), /** reference implementation of multi-allelic EXACT model. Extremely slow for many alternate alleles */ @@ -75,7 +75,12 @@ public enum AFCalculatorImplementation { EXACT_ORIGINAL(OriginalDiploidExactAFCalculator.class, 2, 2), /** implementation that supports any sample ploidy. Currently not available for the HaplotypeCaller */ - EXACT_GENERAL_PLOIDY(GeneralPloidyExactAFCalculator.class); + EXACT_GENERAL_PLOIDY(GeneralPloidyExactAFCalculator.class), + + /** + * Implementation that implements the {@link #EXACT_INDEPENDENT} for any ploidy. + */ + EXACT_GENERAL_INDEPENDENT(IndependentAllelesExactAFCalculator.class); /** * Special max alt allele count indicating that this maximum is in fact unbound (can be anything). @@ -180,7 +185,7 @@ public enum AFCalculatorImplementation { } /** - * Creates new instance + * Creates new instance. * * @throws IllegalStateException if the instance could not be create due to some exception. The {@link Exception#getCause() cause} will hold a reference to the actual exception. * @return never {@code null}. @@ -205,11 +210,14 @@ public enum AFCalculatorImplementation { final AFCalculatorImplementation preferredValue = preferred == null ? DEFAULT : preferred; if (preferredValue.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) return preferredValue; - if (EXACT_INDEPENDENT.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) + else if (EXACT_INDEPENDENT.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) return EXACT_INDEPENDENT; - if (EXACT_REFERENCE.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) + else if (EXACT_REFERENCE.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) return EXACT_REFERENCE; - return EXACT_GENERAL_PLOIDY; + else if (EXACT_GENERAL_INDEPENDENT.usableForParams(requiredPloidy,requiredAlternativeAlleleCount)) + return EXACT_GENERAL_INDEPENDENT; + else + return EXACT_GENERAL_PLOIDY; } /** diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProvider.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProvider.java index afa4501f6..bc62af664 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProvider.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProvider.java @@ -81,7 +81,7 @@ public class GeneralPloidyFailOverAFCalculatorProvider extends AFCalculatorProvi preferredImplementation = AFCalculatorImplementation.bestValue(genotypeArgs.samplePloidy,genotypeArgs.MAX_ALTERNATE_ALLELES, null); preferred = preferredImplementation.newInstance(); preferred.setLogger(logger); - failOver = AFCalculatorImplementation.EXACT_GENERAL_PLOIDY.newInstance(); + failOver = AFCalculatorImplementation.EXACT_GENERAL_INDEPENDENT.newInstance(); failOver.setLogger(logger); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAlleleAFCalculationResult.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAlleleAFCalculationResult.java new file mode 100644 index 000000000..491c2a2f5 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAlleleAFCalculationResult.java @@ -0,0 +1,75 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; + +import htsjdk.variant.variantcontext.Allele; + +import java.util.List; +import java.util.Map; + +/** + * Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call + */ +class IndependentAlleleAFCalculationResult extends AFCalculationResult { + /** + * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call + */ + final List supporting; + + IndependentAlleleAFCalculationResult(final int[] alleleCountsOfMLE, final int nEvaluations, + final List allelesUsedInGenotyping, final double[] log10LikelihoodsOfAC, + final double[] log10PriorsOfAC, + final Map log10pRefByAllele, final List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, + log10PriorsOfAC, log10pRefByAllele); + this.supporting = supporting; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java index 2c9cb2a36..f95c4f648 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalculator.java @@ -109,22 +109,14 @@ import java.util.*; * XB = AB + BC * BB = BB * - * After each allele has its probability calculated we compute the joint posterior - * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i - * prior for the ith least likely allele. + * The posterior of the site being a variant site is calculated using + * the likelihood of the AF whe all alternatives are collapsed to be zero. */ public class IndependentAllelesDiploidExactAFCalculator extends DiploidExactAFCalculator { - /** - * The min. confidence of an allele to be included in the joint posterior. - */ - private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); - private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); - - /** * Sorts AFCalcResults by their posteriors of AF > 0, so the */ @@ -147,21 +139,6 @@ import java.util.*; biAlleleExactModel = new ReferenceDiploidExactAFCalculator(); } - /** - * Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call - */ - private static class MyAFCalculationResult extends AFCalculationResult { - /** - * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call - */ - final List supporting; - - private MyAFCalculationResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pRefByAllele, List supporting) { - super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pRefByAllele); - this.supporting = supporting; - } - } - @Override public AFCalculationResult computeLog10PNonRef(final VariantContext vc, final int defaultPloidy, final double[] log10AlleleFrequencyPriors, final StateTracker stateTracker) { @@ -247,21 +224,6 @@ import java.util.*; return results; } - /** - * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results - */ - private static boolean goodIndependentResult(final VariantContext vc, final List results) { - if ( results.size() != vc.getNAlleles() - 1) return false; - for ( int i = 0; i < results.size(); i++ ) { - if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) - return false; - if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) - return false; - } - - return true; - } - /** * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order * @@ -314,65 +276,6 @@ import java.util.*; } } - /** - * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case - * - * This is handled in the following way: - * - * Suppose we have for a A/B/C site the following GLs: - * - * AA AB BB AC BC CC - * - * and we want to get the bi-allelic GLs for X/B, where X is everything not B - * - * XX = AA + AC + CC (since X = A or C) - * XB = AB + BC - * BB = BB - * - * @param original the original multi-allelic genotype - * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 - * @param nAlts the total number of alt alleles - * @return a new biallelic genotype with appropriate PLs - */ - @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) - @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) - @Deprecated - protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { - if ( original.isNonInformative() ) - return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); - - if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); - - final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector()); - final double[] biAllelicPr = new double[3]; - - for ( int index = 0; index < normalizedPr.length; index++ ) { - final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); - - if ( pair.alleleIndex1 == altIndex ) { - if ( pair.alleleIndex2 == altIndex ) - // hom-alt case - biAllelicPr[2] = normalizedPr[index]; - else - // het-alt case - biAllelicPr[1] += normalizedPr[index]; - } else { - if ( pair.alleleIndex2 == altIndex ) - // het-alt case - biAllelicPr[1] += normalizedPr[index]; - else - // hom-non-alt - biAllelicPr[0] += normalizedPr[index]; - } - } - - final double[] GLs = new double[3]; - for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]); - - return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); - } - - private static final double PHRED_2_LOG10_COEFF = -.1; /** @@ -504,7 +407,7 @@ import java.util.*; nEvaluations += sortedResultWithThetaNPriors.nEvaluations; } - return new MyAFCalculationResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), + return new IndependentAlleleAFCalculationResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), // necessary to ensure all values < 0 MathUtils.normalizeFromLog10(new double[] { combinedAltAllelesResult.getLog10LikelihoodOfAFEq0(), combinedAltAllelesResult.getLog10LikelihoodOfAFGT0() }, true), // priors incorporate multiple alt alleles, must be normalized diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesExactAFCalculator.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesExactAFCalculator.java new file mode 100644 index 000000000..c1078a919 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/IndependentAllelesExactAFCalculator.java @@ -0,0 +1,553 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ +package org.broadinstitute.gatk.tools.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.tools.walkers.genotyper.GeneralPloidyGenotypeLikelihoods; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeAlleleCounts; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculator; +import org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodCalculators; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils; + +import java.util.*; + +/** + * Independent allele exact AF calculator for any ploidy. + * + *

+ * The method is described in {@link IndependentAllelesDiploidExactAFCalculator} for diploids. + *

+ * + * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> + */ +public class IndependentAllelesExactAFCalculator extends ExactAFCalculator { + + private static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 100; // if PL vectors longer than this # of elements, don't log them + + /** + * Array that caches the allele list that corresponds to the ith ploidy. + * + *

+ * Each position of the array imakes reference to a list that contains i copies of {@link Allele#NO_CALL}. + *

+ * + *

+ * This array must be queried using {@link #biallelicNoCall(int)}, which will extend the cache + * to larger ploidies if needed. + *

+ */ + private static volatile List[] BIALLELIC_NOCALL = initialBiallelicNoCall(10); + + /** + * Array that caches the allele list that corresponds to the ith ploidy. + * + *

+ * Each position of the array imakes reference to an array that contains + * all-zero likelihoods with the number of genotypes that correspond + * to a biallelic variant with ploidy i. + *

+ * + *

+ * This array must be queried using {@link #biallelicNonInformativePls(int)}, which will extend the cache + * to larger ploidies if needed. + *

+ */ + private static volatile int[][] BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY = initialBiallelicNonInformativePLsByPloidy(10); + + private static final Comparator AFCALC_RESULT_BY_PNONREF_COMPARATOR = new Comparator() { + @Override + @Requires("o1 != null && o1 != null") + public int compare(final AFCalculationResult o1, final AFCalculationResult o2) { + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + } + }; + + private final ExactAFCalculator biallelicExactAFCalculator; + + protected IndependentAllelesExactAFCalculator(final ExactAFCalculator biallelicExactAFCalculator) { + if (biallelicExactAFCalculator == null) + throw new IllegalArgumentException("the biallelic exact AF calculator cannot be null"); + this.biallelicExactAFCalculator = biallelicExactAFCalculator; + } + + /** + * Creates a new calculator that delegates on {@link GeneralPloidyExactAFCalculator} to run + * the exact model per allele. + * + *

+ * Note: this constructor may be called using reflexion. + *

+ */ + @SuppressWarnings("unused") + protected IndependentAllelesExactAFCalculator() { + this(new GeneralPloidyExactAFCalculator()); + } + + @Override + @Requires("vc != null && likelihoodSums != null") + protected void reduceScopeCalculateLikelihoodSums(final VariantContext vc, final int defaultPloidy, final LikelihoodSum[] likelihoodSums) { + final int numOriginalAltAlleles = likelihoodSums.length; + final GenotypesContext genotypes = vc.getGenotypes(); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + if (!genotype.hasPL()) + continue; + final double[] gls = genotype.getLikelihoods().getAsVector(); + if (MathUtils.sum(gls) >= GATKVariantContextUtils.SUM_GL_THRESH_NOCALL) + continue; + + final int PLindexOfBestGL = MathUtils.maxElementIndex(gls); + + final double bestToHomRefDiffGL = PLindexOfBestGL == PL_INDEX_OF_HOM_REF ? 0.0 : gls[PLindexOfBestGL] - gls[PL_INDEX_OF_HOM_REF]; + final int declaredPloidy = genotype.getPloidy(); + final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; + + final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL); + // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele + for (int k=1; k < acCount.length;k++) + if (acCount[k] > 0 ) + likelihoodSums[k-1].sum += acCount[k] * bestToHomRefDiffGL; + } + } + + @Override + protected GenotypesContext reduceScopeGenotypes(final VariantContext vc, final int defaultPloidy, final List allelesToUse) { + return subsetAlleles(vc,defaultPloidy,allelesToUse,false); + } + + @Override + @Requires("vc != null && log10AlleleFrequencyPriors != null && stateTracker != null") + protected AFCalculationResult computeLog10PNonRef(final VariantContext vc, final int defaultPloidy, final double[] log10AlleleFrequencyPriors, final StateTracker stateTracker) { + final List independentResultTrackers = computeAlleleIndependentExact(vc, defaultPloidy, log10AlleleFrequencyPriors); + + // Paranoia check: + if ( independentResultTrackers.size() <= 1 ) + throw new IllegalStateException("Independent alleles model returned an empty list of results at VC " + vc); + else if ( independentResultTrackers.size() == 2 ) { + // fast path for the very common bi-allelic use case + return independentResultTrackers.get(1); + } else { + final List alternativesOnly = new ArrayList<>(independentResultTrackers.size() - 1); + for (int i = 1; i < independentResultTrackers.size(); i++) + alternativesOnly.add(independentResultTrackers.get(i)); + // we are a multi-allelic, so we need to actually combine the results + final List withMultiAllelicPriors = applyMultiAllelicPriors(alternativesOnly); + return combineIndependentPNonRefs(vc, withMultiAllelicPriors, independentResultTrackers.get(0)); + } + } + + @Requires("conditionalPNonRefResults != null and !conditionalPNonRefResults.empty()") + protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { + final ArrayList sorted = new ArrayList(conditionalPNonRefResults); + + // sort the results, so the most likely allele is first + Collections.sort(sorted, AFCALC_RESULT_BY_PNONREF_COMPARATOR); + + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); + final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); + + for ( int i = 0; i < sorted.size(); i++ ) { + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); + final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true))); + } + + return sorted; + } + + /** + * Take the independent estimates of pNonRef for each alt allele and combine them into a single result + * + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc + * + * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently + */ + @Requires("vc != null && sortedResultsWithThetaNPriors != null && combinedAltAllelesResult != null") + protected AFCalculationResult combineIndependentPNonRefs(final VariantContext vc, + final List sortedResultsWithThetaNPriors, + final AFCalculationResult combinedAltAllelesResult) { + + + int nEvaluations = 0; + final int nAltAlleles = sortedResultsWithThetaNPriors.size(); + final int[] alleleCountsOfMLE = new int[nAltAlleles]; + final Map log10pRefByAllele = new HashMap<>(nAltAlleles); + + // the sum of the log10 posteriors for AF == 0 and AF > 0 to determine joint probs + + for ( final AFCalculationResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { + final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); + final int altI = vc.getAlleles().indexOf(altAllele) - 1; + + // MLE of altI allele is simply the MLE of this allele in altAlleles + alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + log10pRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0()); + + // trivial -- update the number of evaluations + nEvaluations += sortedResultWithThetaNPriors.nEvaluations; + } + + return new IndependentAlleleAFCalculationResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(new double[] { combinedAltAllelesResult.getLog10LikelihoodOfAFEq0(), combinedAltAllelesResult.getLog10LikelihoodOfAFGT0() }, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(new double[] { combinedAltAllelesResult.getLog10PriorOfAFEq0(), combinedAltAllelesResult.getLog10PriorOfAFGT0() }, true), + log10pRefByAllele, sortedResultsWithThetaNPriors); + } + + /** + * Compute the conditional exact AFCalcResult for each allele in vc independently, returning + * the result of each, in order of the alt alleles in VC + * + * @param vc the VariantContext we want to analyze, with at least 1 alt allele + * @param log10AlleleFrequencyPriors the priors + * @return a list of the AFCalcResults for each bi-allelic sub context of vc + */ + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) + @Ensures("goodIndependentResult(vc, result)") + protected final List computeAlleleIndependentExact(final VariantContext vc, final int defaultPloidy, + final double[] log10AlleleFrequencyPriors) { + final List results = new LinkedList<>(); + + for ( final VariantContext subvc : makeAlleleConditionalContexts(vc, defaultPloidy) ) { + final AFCalculationResult resultTracker = biallelicExactAFCalculator.getLog10PNonRef(subvc, defaultPloidy, vc.getNAlleles() - 1, log10AlleleFrequencyPriors); + results.add(resultTracker); + } + + return results; + } + + /** + * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order + * + * @param vc the variant context to split. Must have n.alt.alleles > 1 + * @return a bi-allelic variant context for each alt allele in vc + */ + @Requires({"vc != null", "vc.getNAlleles() > 1"}) + @Ensures("result.size() == vc.getNAlleles() - 1") + protected final List makeAlleleConditionalContexts(final VariantContext vc, final int defaultPloidy) { + final int nAlleles = vc.getNAlleles(); + + // go through the work of ripping up the VC into its biallelic components + final List vcs = new LinkedList<>(); + + for ( int alleleIndex = 0; alleleIndex < nAlleles; alleleIndex++ ) { + vcs.add(biallelicCombinedGLs(vc, defaultPloidy, alleleIndex)); + } + return vcs; + } + + /** + * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex + * + * @param rootVC the root (potentially multi-allelic) variant context + * @param alleleIndex index of the alt allele, from 0 == reference + * @return a bi-allelic variant context based on rootVC + */ + @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) + @Ensures({"result.isBiallelic()"}) + protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int defaultPloidy, final int alleleIndex) { + if ( rootVC.isBiallelic() ) { + return rootVC; + } else { + final int nAlleles = rootVC.getNAlleles(); + final List biallelicGenotypes = new ArrayList<>(rootVC.getNSamples()); + for ( final Genotype g : rootVC.getGenotypes() ) + biallelicGenotypes.add(combineGLs(g, defaultPloidy, alleleIndex, nAlleles)); + + final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); + final Allele allele = alleleIndex == 0 ? rootVC.getReference() : rootVC.getAlternateAllele(alleleIndex - 1); + vcb.alleles(alleleIndex == 0 ? Arrays.asList(allele, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE) : Arrays.asList(rootVC.getReference(), allele)); + vcb.genotypes(biallelicGenotypes); + return vcb.make(); + } + } + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case. + * + *

Uses the log-sum-exp trick in order to work well with very low PLs

+ * + *

This is handled in the following way:

+ * + *

Suppose we have for a A/B/C site the following GLs:

+ * + *

AA AB BB AC BC CC

+ * + *

and we want to get the bi-allelic GLs for X/B, where X is everything not B

+ * + *

XX = AA + AC + CC (since X = A or C)
+ * XB = AB + BC
+ * BB = BB
+ *

+ *

+ * This implementation uses the log-sum-exp trick in order to avoid numeric instability (underflow). + *

+ * + * @param original the original multi-allelic genotype + * @param alleleIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param numberOfAlleles the total number of alleles (alternatives + the reference). + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods() && alleleIndex >= 0"}) + @Ensures({"result.hasLikelihoods()"}) + private Genotype combineGLs(final Genotype original, final int defaultPloidy, final int alleleIndex, final int numberOfAlleles ) { + + final int declaredPloidy = original.getPloidy(); + final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(biallelicNonInformativePls(ploidy)).alleles(biallelicNoCall(ploidy)).make(); + + final int[] pls = original.getPL(); + + final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(ploidy, numberOfAlleles); + final double[] newPLs = new double[ploidy + 1]; + Arrays.fill(newPLs, Double.NEGATIVE_INFINITY); + for (int i = 0; i < pls.length; i++) { + final GenotypeAlleleCounts alleleCounts = calculator.genotypeAlleleCountsAt(i); + final int alleleCount = alleleCounts.alleleCountFor(alleleIndex); + final int newPLIndex = alleleIndex == 0 ? ploidy - alleleCount : alleleCount; + newPLs[newPLIndex] = MathUtils.approximateLog10SumLog10(newPLs[newPLIndex], -.1 * pls[i]); + } + + return new GenotypeBuilder(original).PL(newPLs).alleles(biallelicNoCall(ploidy)).make(); + } + + private static List[] initialBiallelicNoCall(final int initialCapacity) { + final List[] result = new List[initialCapacity + 1]; + for (int i = 0; i < result.length; i++) { + result[i] = GATKVariantContextUtils.noCallAlleles(i); + } + return result; + } + + private static int[][] initialBiallelicNonInformativePLsByPloidy(final int initialCapacity) { + final int[][] result = new int[initialCapacity + 1][]; + for (int i = 0; i < result.length; i++) + result[i] = new int[i]; // { 0, 0, 0 ... 0} is the actual uninformative PL array. + return result; + } + + /** + * Returns a cached array of non-informative PLs (all 0) for a given ploidy. + *

+ * Calling code must never change its elements. + *

+ * @param ploidy the required ploidy. + * @return never {@code null}. + */ + @Requires("ploidy >= 0") + private static int[] biallelicNonInformativePls (final int ploidy) { + if (ploidy >= BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY.length) { + return enlargeIfNecessaryBiallelicNonInformativePlsByPloidyAndGet(ploidy); + } else { + return BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY[ploidy]; + } + } + + /** + * Thread-safe expansion of {@link #BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY}. + * @param ploidy the requested ploidy. + * @return the uninformative likelihoods array for the requested ploidy. + */ + private static synchronized int[] enlargeIfNecessaryBiallelicNonInformativePlsByPloidyAndGet(final int ploidy) { + if (ploidy >= BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY.length) { + final int[][] newValue = Arrays.copyOf(BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY, ploidy * 2); + for (int i = newValue.length - 1; i >= BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY.length; i--) + newValue[i] = new int[i]; // { 0, 0, 0.. } is the actual uninformative PL array. + BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY = newValue; + } + return BIALLELIC_NON_INFORMATIVE_PLS_BY_PLOIDY[ploidy]; + } + + /** + * Returns a cached list of no-call alleles {@link Allele#NO_CALL} that correspond to a given ploidy. + *

+ * Calling code must never change its elements. + *

+ * @param ploidy the required ploidy. + * @return never {@code null}. + */ + private static List biallelicNoCall (final int ploidy) { + if (ploidy >= BIALLELIC_NOCALL.length) { + return enlargeIfNecessaryBiallelicNoCallAndGet(ploidy); + } else { + return BIALLELIC_NOCALL[ploidy]; + } + } + + /** + * Thread-safe expansion of {@link #BIALLELIC_NOCALL}. + * @param ploidy the requested ploidy. + * @return the no-call allele list for the requested ploidy. + */ + private static synchronized List enlargeIfNecessaryBiallelicNoCallAndGet(final int ploidy) { + if (ploidy >= BIALLELIC_NOCALL.length) { + final List[] newValue = Arrays.copyOf(BIALLELIC_NOCALL, ploidy * 2); + for (int i = newValue.length - 1; i >= BIALLELIC_NOCALL.length; i--) + newValue[i] = GATKVariantContextUtils.noCallAlleles(i); + BIALLELIC_NOCALL = newValue; + } + return BIALLELIC_NOCALL[ploidy]; + } + + @Override + @Requires("vc != null && allelesToUse != null") + public GenotypesContext subsetAlleles(VariantContext vc, int defaultPloidy, List allelesToUse, boolean assignGenotypes) { + // the genotypes with PLs + final GenotypesContext oldGTs = vc.getGenotypes(); + + // samples + final List sampleIndices = oldGTs.getSampleNamesOrderedByName(); + + // the new genotypes to create + final GenotypesContext newGTs = GenotypesContext.create(); + + // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final int numNewAltAlleles = allelesToUse.size() - 1; + + + // create the new genotypes + for ( int k = 0; k < oldGTs.size(); k++ ) { + final Genotype g = oldGTs.get(sampleIndices.get(k)); + final int declaredPloidy = g.getPloidy(); + final int ploidy = declaredPloidy <= 0 ? defaultPloidy : declaredPloidy; + if ( !g.hasLikelihoods() ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(),GATKVariantContextUtils.noCallAlleles(ploidy))); + continue; + } + + // create the new likelihoods array from the alleles we are allowed to use + final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); + double[] newLikelihoods; + + // Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization + // and subsetting + if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) { + newLikelihoods = originalLikelihoods; + } else { + newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); + + // might need to re-normalize + newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); + } + + // if there is no mass on the (new) likelihoods, then just no-call the sample + if ( MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) { + newGTs.add(GenotypeBuilder.create(g.getSampleName(), GATKVariantContextUtils.noCallAlleles(ploidy))); + } + else { + final GenotypeBuilder gb = new GenotypeBuilder(g); + + if ( numNewAltAlleles == 0 ) + gb.noPL(); + else + gb.PL(newLikelihoods); + + // if we weren't asked to assign a genotype, then just no-call the sample + if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > GATKVariantContextUtils.SUM_GL_THRESH_NOCALL ) + gb.alleles(GATKVariantContextUtils.noCallAlleles(ploidy)); + else + assignGenotype(gb, newLikelihoods, allelesToUse, ploidy); + newGTs.add(gb.make()); + } + } + + return GATKVariantContextUtils.fixADFromSubsettedAlleles(newGTs, vc, allelesToUse); + } + + + /** + * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs + * + * @param newLikelihoods the PL array + * @param allelesToUse the list of alleles to choose from (corresponding to the PLs) + * @param numChromosomes Number of chromosomes per pool + */ + private void assignGenotype(final GenotypeBuilder gb, + final double[] newLikelihoods, + final List allelesToUse, + final int numChromosomes) { + final int numNewAltAlleles = allelesToUse.size() - 1; + + // find the genotype with maximum likelihoods + final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); + final GenotypeLikelihoodCalculator calculator = GenotypeLikelihoodCalculators.getInstance(numChromosomes,allelesToUse.size()); + final GenotypeAlleleCounts alleleCounts = calculator.genotypeAlleleCountsAt(PLindex); + + gb.alleles(alleleCounts.asAlleleList(allelesToUse)); + + // remove PLs if necessary + if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING) + gb.noPL(); + + if ( numNewAltAlleles > 0 ) + gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods)); + } +} diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java index 8add60421..a1de74780 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite1IntegrationTest.java @@ -74,7 +74,7 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testBOTH_GGA_Pools() { - executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "25a92ef3c727e8daf49d7413cd943fbb"); + executor.PC_LSV_Test(String.format("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s", LSV_ALLELES), "LSV_BOTH_GGA", "BOTH", "217d9108c3014261dbe8befa383a2226"); } @Test(enabled = true) @@ -84,6 +84,10 @@ public class UnifiedGenotyperGeneralPloidySuite1IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); + //TODO interesting case in where the faster but approxiate allele independent Exact AC approach causes an additional allele to pop up!. + //TODO the old MD5 is kept for the record. + //TODO this should be revisit once we get into addressing inaccuracies by the independent allele approach. +// executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "b5ff7530827f4b9039a58bdc8a3560d2"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 1", "LSV_INDEL_DISC_NOREF_p1", "INDEL", "c0271a4c281991a86490c1955456af26"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java index aeaabb9d4..2fb2d11c5 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotyperGeneralPloidySuite2IntegrationTest.java @@ -63,7 +63,7 @@ public class UnifiedGenotyperGeneralPloidySuite2IntegrationTest extends WalkerTe @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","e8422cbaf37bd5dce3c0358115b5af32"); + executor.PC_LSV_Test_NoRef("-A AlleleCountBySample -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","c445bcd828c540c009bc8fc9f48916bc"); } @Test(enabled = true) diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProviderUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProviderUnitTest.java index 5b8bdafbf..a43d330a6 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProviderUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/genotyper/afcalc/GeneralPloidyFailOverAFCalculatorProviderUnitTest.java @@ -88,7 +88,7 @@ public class GeneralPloidyFailOverAFCalculatorProviderUnitTest { Assert.assertNotNull(failOver); final AFCalculatorImplementation failOverImplementation = AFCalculatorImplementation.fromCalculatorClass(failOver.getClass()); Assert.assertTrue(failOverImplementation.usableForParams(PLOIDIES[i],MAX_ALT_ALLELES[j])); - Assert.assertEquals(failOverImplementation, AFCalculatorImplementation.EXACT_GENERAL_PLOIDY); + Assert.assertEquals(failOverImplementation, AFCalculatorImplementation.EXACT_GENERAL_INDEPENDENT); } } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index d13905cfb..ba7676654 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -75,9 +75,13 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "70605a2070d6d614d70b8547eedad2cd"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "c439e51453a031ca60fe96b75423e589"}); + //TODO the latest independent exact AC calculation for haploids causes a clear variant to be lost here. + //TODO this might need to be addressed at some point. + //TODO the following test is commented out for the record + //tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "7f09c261950bf86e435edfa69ed2ec71"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "02300a1f64e085cc0f4420d8160743c1"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "f71ea433b1334a2d146cc6ad76b46d98"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "daddb5349c34e9190f0563e220894748"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "03e5815cc351f1ec2feed89d2aed8268"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "5d5cca382bdf6987b2aef87918ed374c"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "532888aff96c356397a21ef790636818"}); @@ -117,12 +121,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "4c33ec3f6b9dce74e545405588c76ed1"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "d38becbac8a2e0c4b77e9cc11c1ba891"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "57c0f8baaa34f4e0300eaec7b938970b"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "5088d6cf80a6da7c769f97b1ab44c745"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "0895f09731d0ef89ec131c9f75aafe70"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "195017f498d3508e6eee492eb00da97b"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "4f8e3e249509a24da21d5dd8e3594f92"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "aeb64e3cb8d141aa3f8ac7506db15e19"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "03af03feb96e402048801ecfa75344ad"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "ef96f5295b048ef31f5ba82d078a44a2"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "cd809158689ddbbfd18a4eaae016f9a0"}); return tests.toArray(new Object[][]{}); } @@ -135,12 +139,12 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { final String WExIntervals = "-L 20:10,000,000-10,100,000 -isr INTERSECTION -L " + hg19Chr20Intervals; // this functionality can be adapted to provide input data for whatever you might want in your data - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "0bec872d0dc72cc68538cfbd236e9933"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "694405c8f6497b5ce99145538c325cba"}); - tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "47ddf81ed143451aeb27969d397ce4ac"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.NONE, PCRFreeIntervals, "bb85582fcc2ce45407640fa8a70421ab"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.BP_RESOLUTION, PCRFreeIntervals, "e01603c5d4c3142978a91b8cd6a98618"}); + tests.add(new Object[]{NA12878_PCRFREE, ReferenceConfidenceMode.GVCF, PCRFreeIntervals, "b9e471935b960a7dba3eb2b13939ccaf"}); tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.NONE, WExIntervals, "c724ecdaea7eab5a6239ff4daaa6e034"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "f80cc551fe7763bacc46ea6d56582bba"}); - tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "43ef1e04d015c2bd9d513a67a6fc7a02"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.BP_RESOLUTION, WExIntervals, "194d27bd2321ff1a8b895a4e9a8d2938"}); + tests.add(new Object[]{NA12878_WEx, ReferenceConfidenceMode.GVCF, WExIntervals, "4a3dfcfc2f5d27b75725346d63e0b83a"}); return tests.toArray(new Object[][]{}); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 67786f3af..418bdf778 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -107,7 +107,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleHaploid() throws IOException { - HCTest(CEUTRIO_BAM, "-ploidy 1", "0aa0bde7c7188bcf89c65606f975bbaf"); + HCTest(CEUTRIO_BAM, "-ploidy 1", "4c32640204a11fe46a23d97b1012bca2"); } @Test @@ -117,7 +117,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleTetraploid() throws IOException { - HCTest(NA12878_BAM, "-ploidy 4", "89a95ecb4f0326fdf62ba1bb0ee349a3"); + HCTest(NA12878_BAM, "-ploidy 4", "bbae9f77683591200b2034d5461e6425"); } @Test @@ -132,7 +132,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMinBaseQualityTetraploid() throws IOException { - HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "89a95ecb4f0326fdf62ba1bb0ee349a3"); + HCTest(NA12878_BAM, "-mbq 15 -ploidy 4", "bbae9f77683591200b2034d5461e6425"); } @Test @@ -142,7 +142,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerGraphBasedMultiSampleHaploid() throws IOException { - HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "88ea6597151d7c08bfdee035e43f2db6"); + HCTest(CEUTRIO_BAM, "-likelihoodEngine GraphBased -ploidy 1", "5333e3ff4d7fc53624eab801250be4f0"); } @Test @@ -171,7 +171,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGATetraploid() throws IOException { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -ploidy 4 -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf -isr INTERSECTION -L 20:10080000-10100000", - "6e24e1f10b32a8dac3bcc27fbadce3fb"); + "6ba0fa930899f70fee9a7b1161508f93"); } @Test