From 25b8ba45f4a66e2115f4c9d786df887609580f78 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Fri, 23 Oct 2015 15:23:37 -0400 Subject: [PATCH] More allele-specific annotations: AS_QD and AS_InbreedingCoeff Grouped default output annotations to keep them from getting dropped when -A is specified; addresses #918 Also refactored code shared by ExcessHet and InbreedingCoeff --- .../walkers/annotator/AS_InbreedingCoeff.java | 173 ++++++++++++ .../walkers/annotator/AS_QualByDepth.java | 203 ++++++++++++++ .../walkers/annotator/AnnotationUtils.java | 38 +++ .../annotator/ClippingRankSumTest.java | 3 +- .../walkers/annotator/DepthPerSampleHC.java | 3 +- .../tools/walkers/annotator/ExcessHet.java | 115 ++++---- .../annotator/HeterozygosityUtils.java | 236 +++++++++++++++++ .../walkers/annotator/InbreedingCoeff.java | 154 +++++------ .../tools/walkers/annotator/QualByDepth.java | 52 ++-- .../interfaces/StandardHCAnnotation.java | 57 ++++ .../walkers/genotyper/GenotypingEngine.java | 44 ++- .../genotyper/UnifiedGenotypingEngine.java | 59 ++++- .../haplotypecaller/HaplotypeCaller.java | 8 +- .../walkers/variantutils/GenotypeGVCFs.java | 16 +- .../annotator/AS_InbreedingCoeffUnitTest.java | 250 ++++++++++++++++++ .../walkers/annotator/ExcessHetUnitTest.java | 2 +- .../annotator/InbreedingCoeffUnitTest.java | 32 ++- ...lexAndSymbolicVariantsIntegrationTest.java | 2 +- .../HaplotypeCallerGVCFIntegrationTest.java | 2 +- .../HaplotypeCallerIntegrationTest.java | 6 +- .../GenotypeGVCFsIntegrationTest.java | 66 +++-- .../annotator/VariantAnnotatorEngine.java | 7 +- .../gatk/utils/variant/GATKVCFConstants.java | 3 + .../utils/variant/GATKVCFHeaderLines.java | 3 + 24 files changed, 1273 insertions(+), 261 deletions(-) create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InbreedingCoeff.java create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_QualByDepth.java create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HeterozygosityUtils.java create mode 100644 protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/StandardHCAnnotation.java create mode 100644 protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InbreedingCoeffUnitTest.java diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InbreedingCoeff.java new file mode 100644 index 000000000..ad64730aa --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_InbreedingCoeff.java @@ -0,0 +1,173 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; + +import java.util.*; + +/** + * Allele-specific, likelihood-based test for the inbreeding among samples + * + *

This annotation estimates whether there is evidence of inbreeding in a population. The higher the score, the higher the chance that there is inbreeding.

+ * + *

Statistical notes

+ *

The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.

+ * + *

Caveats

+ * + * + */ +//TODO: this can't extend InbreedingCoeff because that one is Standard and it would force this to be output all the time; should fix code duplication nonetheless +public class AS_InbreedingCoeff extends InfoFieldAnnotation implements AS_StandardAnnotation { + + private final static Logger logger = Logger.getLogger(InbreedingCoeff.class); + protected static final int MIN_SAMPLES = 10; + private Set founderIds; + private boolean didUniquifiedSampleNameCheck = false; + final private boolean RETURN_ROUNDED = false; + protected HeterozygosityUtils heterozygosityUtils; + + @Override + public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { + //If available, get the founder IDs and cache them. the IC will only be computed on founders then. + if(founderIds == null && walker != null) { + founderIds = ((Walker) walker).getSampleDB().getFounderIds(); + } + if(walker != null && (((Walker) walker).getSampleDB().getSamples().size() < MIN_SAMPLES || (!founderIds.isEmpty() && founderIds.size() < MIN_SAMPLES))) + logger.warn("Annotation will not be calculated. InbreedingCoeff requires at least " + MIN_SAMPLES + " unrelated samples."); + //intialize a HeterozygosityUtils before annotating for use in unit tests + heterozygosityUtils = new HeterozygosityUtils(RETURN_ROUNDED); + } + + @Override + public List getKeyNames() { return Collections.singletonList(GATKVCFConstants.AS_INBREEDING_COEFFICIENT_KEY); } + + @Override + public List getDescriptions() { return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } + + @Override + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + + //create a new HeterozygosityUtils to store data for each VariantContext, i.e. each annotate() call + heterozygosityUtils = new HeterozygosityUtils(RETURN_ROUNDED); + + //if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders + if (!didUniquifiedSampleNameCheck) { + founderIds = AnnotationUtils.validateFounderIDs(founderIds, vc); + didUniquifiedSampleNameCheck = true; + } + return makeCoeffAnnotation(vc); + } + + protected Map makeCoeffAnnotation(final VariantContext vc) { + final List altAlleles = vc.getAlternateAlleles(); + final List ICvalues = new ArrayList<>(); + + for (final Allele a : altAlleles) { + ICvalues.add(calculateIC(vc, a)); + } + if (heterozygosityUtils.getSampleCount() < MIN_SAMPLES) + return null; + return Collections.singletonMap(getKeyNames().get(0), (Object) AnnotationUtils.encodeValueList(ICvalues, "%.4f")); + } + + protected double calculateIC(final VariantContext vc, final Allele altAllele) { + final int AN = vc.getCalledChrCount(); + final double altAF; + + final double hetCount = heterozygosityUtils.getHetCount(vc, altAllele); + + final double F; + //shortcut to get a value closer to the non-alleleSpecific value for bialleleics + if (vc.isBiallelic()) { + double refAC = heterozygosityUtils.getAlleleCount(vc, vc.getReference()); + double altAC = heterozygosityUtils.getAlleleCount(vc, altAllele); + double refAF = refAC/(altAC+refAC); + altAF = 1 - refAF; + F = 1.0 - (hetCount / (2.0 * refAF * altAF * (double) heterozygosityUtils.getSampleCount())); // inbreeding coefficient + } + else { + //compare number of hets for this allele (and any other second allele) with the expectation based on AFs + //derive the altAF from the likelihoods to account for any accumulation of fractional counts from non-primary likelihoods, + //e.g. for a GQ10 variant, the probability of the call will be ~0.9 and the second best call will be ~0.1 so adding up those 0.1s for het counts can dramatically change the AF compared with integer counts + altAF = heterozygosityUtils.getAlleleCount(vc, altAllele)/ (double) AN; + F = 1.0 - (hetCount / (2.0 * (1 - altAF) * altAF * (double) heterozygosityUtils.getSampleCount())); // inbreeding coefficient + } + + return F; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_QualByDepth.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_QualByDepth.java new file mode 100644 index 000000000..e5cad49a6 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_QualByDepth.java @@ -0,0 +1,203 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypesContext; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation; +import org.broadinstitute.gatk.utils.MathUtils; +import org.broadinstitute.gatk.utils.contexts.AlignmentContext; +import org.broadinstitute.gatk.utils.contexts.ReferenceContext; +import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; +import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; +import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines; + +import java.util.*; + +/** + * Per-allele call confidence normalized by depth of sample reads supporting the allele + * + *

This annotation puts the variant confidence QUAL score into perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.

+ * + *

Statistical notes

+ *

The QD is the QUAL score normalized by allele depth (AD) for a variant. For a single sample, the HaplotypeCaller calculates the QD by taking QUAL/AD. For multiple samples, HaplotypeCaller and GenotypeGVCFs calculate the QD by taking QUAL/AD of samples with a non hom-ref genotype call. The reason we leave out the samples with a hom-ref call is to not penalize the QUAL for the other samples with the variant call.

+ *

Here is a single sample example:

+ *
2	37629	.	C	G	1063.77	.	AC=2;AF=1.00;AN=2;DP=31;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=58.50;QD=34.32;SOR=2.376	GT:AD:DP:GQ:PL:QSS	1/1:0,31:31:93:1092,93,0:0,960
+

QUAL/AD = 1063.77/31 = 34.32 = QD

+ *

Here is a multi-sample example:

+ *
10	8046	.	C	T	4107.13	.	AC=1;AF=0.167;AN=6;BaseQRankSum=-3.717;DP=1063;FS=1.616;MLEAC=1;MLEAF=0.167;QD=11.54
+ GT:AD:DP:GQ:PL:QSS	0/0:369,4:373:99:0,1007,12207:10548,98	    0/0:331,1:332:99:0,967,11125:9576,27	    0/1:192,164:356:99:4138,0,5291:5501,4505
+ *

QUAL/AD = 4107.13/356 = 11.54 = QD

+ *

Note that currently, when HaplotypeCaller is run with `-ERC GVCF`, the QD calculation is invoked before AD itself has been calculated, due to a technical constraint. In that case, HaplotypeCaller uses the number of overlapping reads from the haplotype likelihood calculation in place of AD to calculate QD, which generally yields a very similar number. This does not cause any measurable problems, but can cause some confusion since the number may be slightly different than what you would expect to get if you did the calculation manually. For that reason, this behavior will be modified in an upcoming version.

+ * + *

Caveat

+ *

This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.

+ * + *

Related annotations

+ *
    + *
  • Coverage gives the filtered depth of coverage for each sample and the unfiltered depth across all samples.
  • + *
  • DepthPerAlleleBySample calculates depth of coverage for each allele per sample (AD).
  • + *
+ */ +public class AS_QualByDepth extends InfoFieldAnnotation implements ReducibleAnnotation, AS_StandardAnnotation { + + @Override + public List getKeyNames() { return Arrays.asList(GATKVCFConstants.AS_QUAL_BY_DEPTH_KEY); } + + @Override + public String getRawKeyName() { return GATKVCFConstants.AS_QUAL_KEY; } + + public List getDescriptions() { + //We only have the finalized key name here because the raw key is internal to GenotypeGVCFs and won't get output in any VCF + return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); + } + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + return null; + } + + private List getAlleleDepths(final GenotypesContext genotypes) { + int numAlleles = -1; + for (final Genotype genotype : genotypes) { + if (genotype.hasAD()) { + numAlleles = genotype.getAD().length; + break; + } + } + if (numAlleles == -1) //no genotypes have AD + return null; + Integer[] alleleDepths = new Integer[numAlleles]; + for (int i = 0; i < alleleDepths.length; i++) { + alleleDepths[i] = 0; + } + for (final Genotype genotype : genotypes) { + // we care only about genotypes with variant alleles + if ( !genotype.isHet() && !genotype.isHomVar() ) + continue; + + // if we have the AD values for this sample, let's make sure that the variant depth is greater than 1! + if ( genotype.hasAD() ) { + final int[] AD = genotype.getAD(); + final int totalADdepth = (int) MathUtils.sum(AD); + if ( totalADdepth - AD[0] > 1 ) { + for (int i = 0; i < AD.length; i++) { + alleleDepths[i] += AD[i]; + } + } + } + } + return Arrays.asList(alleleDepths); + } + + @Override + public Map annotateRawData(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc, Map stratifiedPerReadAlleleLikelihoodMap) { + return null; + } + + @Override + public Map combineRawData(List allelesList, List listOfRawData) { + return null; + } + + @Override + public Map finalizeRawData(VariantContext vc, VariantContext originalVC) { + //we need to use the AS_QUAL value that was added to the VC by the GenotypingEngine + if ( !vc.hasAttribute(GATKVCFConstants.AS_QUAL_KEY) ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.isEmpty() ) + return null; + + final List standardDepth = getAlleleDepths(genotypes); + + //Parse the VC's allele-specific qual values + List alleleQualObjList = vc.getAttributeAsList(GATKVCFConstants.AS_QUAL_KEY); + if (alleleQualObjList.size() != vc.getNAlleles() -1) + throw new IllegalStateException("Number of AS_QUAL values doesn't match the number of alternate alleles."); + List alleleQualList = new ArrayList<>(); + for (final Object obj : alleleQualObjList) { + alleleQualList.add(Double.parseDouble(obj.toString())); + } + + // Don't normalize indel length for AS_QD because it will only be called from GenotypeGVCFs, never UG + List QDlist = new ArrayList<>(); + double refDepth = (double)standardDepth.get(0); + for (int i = 0; i < alleleQualList.size(); i++) { + double AS_QD = -10.0 * alleleQualList.get(i) / ((double)standardDepth.get(i+1) + refDepth); //+1 to skip the reference field of the AD, add ref counts to each to match biallelic case + // Hack: see note in the fixTooHighQD method below + AS_QD = QualByDepth.fixTooHighQD(AS_QD); + QDlist.add(AS_QD); + } + + final Map map = new HashMap<>(); + map.put(getKeyNames().get(0), AnnotationUtils.encodeValueList(QDlist, "%.2f")); + return map; + } + + @Override + public void calculateRawData(VariantContext vc, Map pralm, ReducibleAnnotationData rawAnnotations) { + //note that the "raw data" used here is calculated by the GenotypingEngine in GenotypeGVCFs and stored in the AS_QUAL info field + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java index 843f95108..81e645a72 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java @@ -56,6 +56,8 @@ import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMRecord; import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel; @@ -66,11 +68,29 @@ import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + public class AnnotationUtils { public static final String ANNOTATION_HC_WARN_MSG = " annotation will not be calculated, must be called from HaplotypeCaller"; public static final int WARNINGS_LOGGED_SIZE = 3; + /** + * Helper function to parse the list into the annotation string + * @param valueList the ArrayList returned from StrandBiasBySample.annotate() + * @return the array used by the per-sample Strand Bias annotation + */ + protected static String encodeValueList( final List valueList, final String precisionFormat ) { + List outputList = new ArrayList<>(); + for (Double d : valueList) { + outputList.add(String.format(precisionFormat, d)); + } + return StringUtils.join(outputList, ","); + } + /** * Checks if the walker is compatible with allele-specific annotations */ @@ -153,6 +173,24 @@ public class AnnotationUtils { } + //this method is intended to reconcile uniquified sample names + // it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples because founderIds + // is derived from the sampleDB, which comes from the input sample names, but vc will have uniquified (i.e. different) + // sample names. Without this check, the founderIds won't be found in the vc and the annotation won't be calculated. + protected static Set validateFounderIDs(final Set founderIds, final VariantContext vc) { + Set vcSamples = new HashSet<>(); + Set returnIDs = founderIds; + vcSamples.addAll(vc.getSampleNames()); + if (!vcSamples.isEmpty()) { + if (founderIds != null) { + vcSamples.removeAll(founderIds); + if (vcSamples.equals(vc.getSampleNames())) + returnIDs = vc.getSampleNames(); + } + } + return returnIDs; + } + /** * Get the position of a variant within a read with respect to the closer end, accounting for hard clipped bases and low quality ends * Used by ReadPosRankSum annotations diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java index 4963555b4..c05d80b11 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ClippingRankSumTest.java @@ -52,6 +52,7 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardHCAnnotation; import org.broadinstitute.gatk.utils.sam.AlignmentUtils; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; @@ -71,7 +72,7 @@ import java.util.*; *

The clipping rank sum test cannot be calculated for sites without a mixture of reads showing both the reference and alternate alleles.

* */ -public class ClippingRankSumTest extends RankSumTest { +public class ClippingRankSumTest extends RankSumTest implements StandardHCAnnotation{ @Override public List getKeyNames() { return Arrays.asList(GATKVCFConstants.CLIPPING_RANK_SUM_KEY); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java index adb4dbbca..339d4fc3e 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java @@ -56,6 +56,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardHCAnnotation; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele; @@ -93,7 +94,7 @@ import java.util.*; * * */ -public class DepthPerSampleHC extends GenotypeAnnotation { +public class DepthPerSampleHC extends GenotypeAnnotation implements StandardHCAnnotation{ private final static Logger logger = Logger.getLogger(DepthPerSampleHC.class); private boolean alleleLikelihoodMapSubsetWarningLogged = false; private final boolean[] warningsLogged = new boolean[AnnotationUtils.WARNINGS_LOGGED_SIZE]; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java index 31d158c6f..076d5bc26 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHet.java @@ -1,44 +1,44 @@ /* * By downloading the PROGRAM you agree to the following terms of use: -* +* * BROAD INSTITUTE * SOFTWARE LICENSE AGREEMENT * FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY -* +* * This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). -* +* * WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and * WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. * NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: -* +* * 1. DEFINITIONS * 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. -* +* * 2. LICENSE * 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. * The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. * 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. * 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. -* +* * 3. PHONE-HOME FEATURE * LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. -* +* * 4. OWNERSHIP OF INTELLECTUAL PROPERTY * LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. * Copyright 2012-2015 Broad Institute, Inc. * Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. * LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. -* +* * 5. INDEMNIFICATION * LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. -* +* * 6. NO REPRESENTATIONS OR WARRANTIES * THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. * IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. -* +* * 7. ASSIGNMENT * This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. -* +* * 8. MISCELLANEOUS * 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. * 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. @@ -52,8 +52,11 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.vcf.VCFHeaderLine; import org.apache.commons.math.stat.StatUtils; import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.engine.walkers.Walker; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; @@ -80,8 +83,20 @@ import java.util.*; */ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static Logger logger = Logger.getLogger(ExcessHet.class); - private int sampleCount; private final double minNeededValue = 1.0E-16; + private Set founderIds; + private final boolean RETURN_ROUNDED = true; + private int sampleCount = -1; + + @Override + public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { + //If available, get the founder IDs and cache them. The ExcessHet value will only be computed on founders then. + //excessHet respects pedigree files, but doesn't require a minimum number of samples + if(founderIds == null && walker != null) { + founderIds = ((Walker) walker).getSampleDB().getFounderIds(); + } + + } @Override public Map annotate(final RefMetaDataTracker tracker, @@ -95,59 +110,15 @@ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation } protected double calculateEH(final VariantContext vc, final GenotypesContext genotypes) { - final boolean doMultiallelicMapping = !vc.isBiallelic(); - - int idxAA = 0, idxAB = 1, idxBB = 2; - - int refCount = 0; - int hetCount = 0; - int homCount = 0; - sampleCount = 0; // number of samples that have likelihoods - - for (final Genotype g : genotypes) { - if (g.isCalled() && g.hasLikelihoods() && g.getPloidy() == 2) // only work for diploid samples - sampleCount++; - else - continue; - - //Need to round the likelihoods to deal with small numerical deviations due to normalizing - final double[] normalizedLikelihoodsUnrounded = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector()); - final int[] normalizedLikelihoods = new int[normalizedLikelihoodsUnrounded.length]; - for (int i = 0; i < normalizedLikelihoodsUnrounded.length; i++) { - normalizedLikelihoods[i] = (int) Math.round(normalizedLikelihoodsUnrounded[i]); - } - - if (doMultiallelicMapping) { - if (g.isHetNonRef()) { - //all likelihoods go to homCount - homCount++; - continue; - } - - //get alternate allele for each sample - final Allele a1 = g.getAllele(0); - final Allele a2 = g.getAllele(1); - if (a2.isNonReference()) { - final int[] idxVector = vc.getGLIndecesOfAlternateAllele(a2); - idxAA = idxVector[0]; - idxAB = idxVector[1]; - idxBB = idxVector[2]; - } - //I expect hets to be reference first, but there are no guarantees (e.g. phasing) - else if (a1.isNonReference()) { - final int[] idxVector = vc.getGLIndecesOfAlternateAllele(a1); - idxAA = idxVector[0]; - idxAB = idxVector[1]; - idxBB = idxVector[2]; - } - } - - refCount += normalizedLikelihoods[idxAA]; - hetCount += normalizedLikelihoods[idxAB]; - homCount += normalizedLikelihoods[idxBB]; + HeterozygosityUtils heterozygosityUtils = new HeterozygosityUtils(RETURN_ROUNDED); + final double[] genotypeCountsDoubles = heterozygosityUtils.getGenotypeCountsForRefVsAllAlts(vc, genotypes); + sampleCount = heterozygosityUtils.getSampleCount(); + final int[] genotypeCounts = new int[genotypeCountsDoubles.length]; + for(int i = 0; i < genotypeCountsDoubles.length; i++) { + genotypeCounts[i] = (int)genotypeCountsDoubles[i]; } - double pval = exactTest(hetCount, refCount, homCount); + double pval = exactTest(genotypeCounts); //If the actual phredPval would be infinity we will probably still filter out just a very large number if (pval == 0) { @@ -164,13 +135,21 @@ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation * be more computationally expensive to calculate accuracy beyond a given threshold. Here we have enough accuracy * to filter anything below a p-value of 10E-6. * - * @param hetCount Number of observed hets (n_ab) - * @param refCount Number of observed homRefs (n_aa) - * @param homCount Number of observed homVars (n_bb) + * @param genotypeCounts Number of observed genotypes (n_aa, n_ab, n_bb) * @return Right sided p-value or the probability of getting the observed or higher number of hets given the sample * size (N) and the observed number of allele a (rareCopies) */ - protected double exactTest(final int hetCount, final int refCount, final int homCount) { + protected double exactTest(final int[] genotypeCounts) { + if (genotypeCounts.length != 3) { + throw new IllegalStateException("Input genotype counts must be length 3 for the number of genotypes with {2, 1, 0} ref alleles."); + } + final int REF_INDEX = 0; + final int HET_INDEX = 1; + final int VAR_INDEX = 2; + + final int refCount = genotypeCounts[REF_INDEX]; + final int hetCount = genotypeCounts[HET_INDEX]; + final int homCount = genotypeCounts[VAR_INDEX]; if (hetCount < 0 || refCount < 0 || homCount < 0) { throw new IllegalArgumentException("Genotype counts cannot be less than 0"); @@ -257,7 +236,7 @@ public class ExcessHet extends InfoFieldAnnotation implements StandardAnnotation } protected Map makeEHAnnotation(final VariantContext vc) { - final GenotypesContext genotypes = vc.getGenotypes(); + final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); if (genotypes == null || !vc.isVariant()) return null; double EH = calculateEH(vc, genotypes); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HeterozygosityUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HeterozygosityUtils.java new file mode 100644 index 000000000..ee45f9314 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/HeterozygosityUtils.java @@ -0,0 +1,236 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator; + +import htsjdk.variant.variantcontext.*; +import org.broadinstitute.gatk.utils.MathUtils; +import java.util.HashMap; +import java.util.Map; + +/** + * A class containing utility methods used in the calculation of annotations related to cohort heterozygosity, e.g. InbreedingCoefficient and ExcessHet + * Stores sample count to make sure we never have to iterate the genotypes more than once + * Should be reinitialized for each VariantContext + */ +public class HeterozygosityUtils { + + final public static int REF_INDEX = 0; + final public static int HET_INDEX = 1; + final public static int VAR_INDEX = 2; + + protected int sampleCount = -1; + private Map hetCounts; + private Map alleleCounts; + boolean returnRounded = false; + + /** + * Create a new HeterozygosityUtils -- a new class should be instantiated for each VariantContext to store data for that VC + * @param returnRounded round the likelihoods to return integer numbers of counts (as doubles) + */ + protected HeterozygosityUtils(final boolean returnRounded) { + this.returnRounded = returnRounded; + } + + /** + * Get the genotype counts for A/A, A/B, and B/B where A is the reference and B is any alternate allele + * @param vc + * @param genotypes may be subset to just founders if a pedigree file is provided + * @return may be null, otherwise length-3 double[] representing homRef, het, and homVar counts + */ + protected double[] getGenotypeCountsForRefVsAllAlts(final VariantContext vc, final GenotypesContext genotypes) { + if (genotypes == null || !vc.isVariant()) + return null; + + final boolean doMultiallelicMapping = !vc.isBiallelic(); + + int idxAA = 0, idxAB = 1, idxBB = 2; + + double refCount = 0; + double hetCount = 0; + double homCount = 0; + + sampleCount = 0; + for (final Genotype g : genotypes) { + if (g.isCalled() && g.hasLikelihoods() && g.getPloidy() == 2) // only work for diploid samples + sampleCount++; + else + continue; + + //Need to round the likelihoods to deal with small numerical deviations due to normalizing + final double[] normalizedLikelihoodsUnrounded = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector()); + double[] normalizedLikelihoods = new double[normalizedLikelihoodsUnrounded.length]; + if (returnRounded) { + for (int i = 0; i < normalizedLikelihoodsUnrounded.length; i++) { + normalizedLikelihoods[i] = Math.round(normalizedLikelihoodsUnrounded[i]); + } + } else { + normalizedLikelihoods = normalizedLikelihoodsUnrounded; + } + + if (doMultiallelicMapping) { + if (g.isHetNonRef()) { + //all likelihoods go to homCount + homCount++; + continue; + } + + if (!g.isHomRef()) { + //get alternate allele for each sample + final Allele a1 = g.getAllele(0); + final Allele a2 = g.getAllele(1); + final int[] idxVector = vc.getGLIndecesOfAlternateAllele(a2.isNonReference() ? a2 : a1); + idxAA = idxVector[0]; + idxAB = idxVector[1]; + idxBB = idxVector[2]; + } + } + + refCount += normalizedLikelihoods[idxAA]; + hetCount += normalizedLikelihoods[idxAB]; + homCount += normalizedLikelihoods[idxBB]; + } + return new double[]{refCount, hetCount, homCount}; + } + + /** + * Get the count of heterozygotes in vc for a specific altAllele (both reference and non-reference hets, e.g. 1/2) + * @param vc + */ + protected void doGenotypeCalculations(final VariantContext vc) { + final GenotypesContext genotypes = vc.getGenotypes(); + if (genotypes == null || !vc.isVariant()) + return; + + final int numAlleles = vc.getNAlleles(); + + sampleCount = 0; + if (hetCounts == null && alleleCounts == null) { + hetCounts = new HashMap<>(); + alleleCounts = new HashMap<>(); + for (final Allele a : vc.getAlleles()) { + if (a.isNonReference()) + hetCounts.put(a, 0.0); + alleleCounts.put(a, 0.0); + } + + int idxAB; + + //for each sample + for (final Genotype g : genotypes) { + if (g.isCalled() && g.hasLikelihoods() && g.getPloidy() == 2) // only work for diploid samples + sampleCount++; + else + continue; + + int altIndex = 0; + for(final Allele a : vc.getAlternateAlleles()) { + //for each alt allele index from 1 to N + altIndex++; + + final double[] normalizedLikelihoodsUnrounded = MathUtils.normalizeFromLog10(g.getLikelihoods().getAsVector()); + double[] normalizedLikelihoods = new double[normalizedLikelihoodsUnrounded.length]; + if (returnRounded) { + for (int i = 0; i < normalizedLikelihoodsUnrounded.length; i++) { + normalizedLikelihoods[i] = Math.round(normalizedLikelihoodsUnrounded[i]); + } + } else { + normalizedLikelihoods = normalizedLikelihoodsUnrounded; + } + //iterate over the other alleles + for (int i = 0; i < numAlleles; i++) { + //only add homozygotes to alleleCounts, not hetCounts + if (i == altIndex) { + final double currentAlleleCounts = alleleCounts.get(a); + alleleCounts.put(a, currentAlleleCounts + 2*normalizedLikelihoods[GenotypeLikelihoods.calculatePLindex(altIndex,altIndex)]); + continue; + } + //pull out the heterozygote PL index, ensuring that the first allele index < second allele index + idxAB = GenotypeLikelihoods.calculatePLindex(Math.min(i,altIndex),Math.max(i,altIndex)); + final double aHetCounts = hetCounts.get(a); + hetCounts.put(a, aHetCounts + normalizedLikelihoods[idxAB]); + final double currentAlleleCounts = alleleCounts.get(a); + //these are guaranteed to be hets + alleleCounts.put(a, currentAlleleCounts + normalizedLikelihoods[idxAB]); + final double refAlleleCounts = alleleCounts.get(vc.getReference()); + alleleCounts.put(vc.getReference(), refAlleleCounts + normalizedLikelihoods[idxAB]); + } + //add in ref/ref likelihood + final double refAlleleCounts = alleleCounts.get(vc.getReference()); + alleleCounts.put(vc.getReference(), refAlleleCounts + 2*normalizedLikelihoods[0]); + } + + } + } + } + + /** + * Get the count of heterozygotes in vc for a specific altAllele (both reference and non-reference hets, e.g. 1/2) + * @param vc + * @param altAllele the alternate allele of interest + * @return number of hets + */ + protected double getHetCount(final VariantContext vc, final Allele altAllele) { + if (hetCounts == null) + doGenotypeCalculations(vc); + return hetCounts.containsKey(altAllele)? hetCounts.get(altAllele) : 0; + } + + protected double getAlleleCount(final VariantContext vc, final Allele allele) { + if (alleleCounts == null) + doGenotypeCalculations(vc); + return alleleCounts.containsKey(allele)? alleleCounts.get(allele) : 0; + } + + protected int getSampleCount() { + return sampleCount; + } +} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java index a4ddd71db..28c707e47 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeff.java @@ -52,19 +52,16 @@ package org.broadinstitute.gatk.tools.walkers.annotator; import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.vcf.VCFHeaderLine; import org.apache.log4j.Logger; +import org.broadinstitute.gatk.engine.GenomeAnalysisEngine; +import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*; import org.broadinstitute.gatk.utils.contexts.AlignmentContext; import org.broadinstitute.gatk.utils.contexts.ReferenceContext; import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker; import org.broadinstitute.gatk.engine.walkers.Walker; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.gatk.utils.MathUtils; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypesContext; import htsjdk.variant.variantcontext.VariantContext; import org.broadinstitute.gatk.utils.variant.GATKVCFConstants; @@ -79,24 +76,36 @@ import java.util.*; *

This annotation estimates whether there is evidence of inbreeding in a population. The higher the score, the higher the chance that there is inbreeding.

* *

Statistical notes

- *

The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is a Phred-scaled p-value derived from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.

+ *

The calculation is a continuous generalization of the Hardy-Weinberg test for disequilibrium that works well with limited coverage per sample. The output is the F statistic from running the HW test for disequilibrium with PL values. See the method document on statistical tests for a more detailed explanation of this statistical test.

* *

Caveats

*
    *
  • The Inbreeding Coefficient can only be calculated for cohorts containing at least 10 founder samples.
  • - *
  • This annotation is used in variant recalibration, but may not be appropriate for that purpose if the cohort being analyzed contains many closely related individuals.
  • - *
  • This annotation requires a valid pedigree file.
  • + *
  • This annotation is used in variant filtering, but may not be appropriate for that purpose if the cohort being analyzed contains many closely related individuals.
  • + *
  • This annotation can take a valid pedigree file to specify founders.
  • *
* */ -public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { +public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation, ReducibleAnnotation { private final static Logger logger = Logger.getLogger(InbreedingCoeff.class); - private static final int MIN_SAMPLES = 10; + protected static final int MIN_SAMPLES = 10; private Set founderIds; - private int sampleCount; - private boolean pedigreeCheckWarningLogged = false; private boolean didUniquifiedSampleNameCheck = false; + protected HeterozygosityUtils heterozygosityUtils; + final private boolean RETURN_ROUNDED = false; + + @Override + public void initialize (final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set headerLines ) { + //If available, get the founder IDs and cache them. the IC will only be computed on founders then. + if(founderIds == null && walker != null) { + founderIds = ((Walker) walker).getSampleDB().getFounderIds(); + } + if(walker != null && (((Walker) walker).getSampleDB().getSamples().size() < MIN_SAMPLES || (!founderIds.isEmpty() && founderIds.size() < MIN_SAMPLES))) + logger.warn("Annotation will not be calculated. InbreedingCoeff requires at least " + MIN_SAMPLES + " unrelated samples."); + //intialize a HeterozygosityUtils before annotating for use in unit tests + heterozygosityUtils = new HeterozygosityUtils(RETURN_ROUNDED); + } @Override public Map annotate(final RefMetaDataTracker tracker, @@ -105,78 +114,59 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno final Map stratifiedContexts, final VariantContext vc, final Map perReadAlleleLikelihoodMap ) { - //If available, get the founder IDs and cache them. the IC will only be computed on founders then. - if(founderIds == null && walker != null) { - founderIds = ((Walker) walker).getSampleDB().getFounderIds(); - } + + heterozygosityUtils = new HeterozygosityUtils(RETURN_ROUNDED); + //if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders if (!didUniquifiedSampleNameCheck) { - checkSampleNames(vc); + founderIds = AnnotationUtils.validateFounderIDs(founderIds, vc); didUniquifiedSampleNameCheck = true; } - if ( founderIds == null || founderIds.isEmpty() ) { - if ( !pedigreeCheckWarningLogged ) { - logger.warn("Annotation will not be calculated, must provide a valid PED file (-ped) from the command line."); - pedigreeCheckWarningLogged = true; - } - return null; - } - else{ - return makeCoeffAnnotation(vc); + return makeCoeffAnnotation(vc); + } + + //Inbreeding coeff doesn't need raw data -- it's calculated from the final genotypes + @Override + public String getRawKeyName() { return null; } + + @Override + public Map annotateRawData(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, final Map stratifiedContexts, final VariantContext vc, final Map stratifiedPerReadAlleleLikelihoodMap) { + return null; + } + + @Override + public void calculateRawData(final VariantContext vc, final Map pralm, final ReducibleAnnotationData rawAnnotations) { } + + @Override + public Map combineRawData(final List allelesList, final List listOfRawData) { + return null; + } + + @Override + public Map finalizeRawData(final VariantContext vc, final VariantContext originalVC) { + heterozygosityUtils = new HeterozygosityUtils(RETURN_ROUNDED); + + //if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders + if (!didUniquifiedSampleNameCheck) { + founderIds = AnnotationUtils.validateFounderIDs(founderIds, vc); + didUniquifiedSampleNameCheck = true; } + return makeCoeffAnnotation(vc); } protected double calculateIC(final VariantContext vc, final GenotypesContext genotypes) { - final boolean doMultiallelicMapping = !vc.isBiallelic(); - - int idxAA = 0, idxAB = 1, idxBB = 2; - - double refCount = 0.0; - double hetCount = 0.0; - double homCount = 0.0; - sampleCount = 0; // number of samples that have likelihoods - - for ( final Genotype g : genotypes ) { - if ( g.isCalled() && g.hasLikelihoods() && g.getPloidy() == 2) // only work for diploid samples - sampleCount++; - else - continue; - final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10( g.getLikelihoods().getAsVector() ); - if (doMultiallelicMapping) - { - if (g.isHetNonRef()) { - //all likelihoods go to homCount - homCount++; - continue; - } - - //get alternate allele for each sample - final Allele a1 = g.getAllele(0); - final Allele a2 = g.getAllele(1); - if (a2.isNonReference()) { - final int[] idxVector = vc.getGLIndecesOfAlternateAllele(a2); - idxAA = idxVector[0]; - idxAB = idxVector[1]; - idxBB = idxVector[2]; - } - //I expect hets to be reference first, but there are no guarantees (e.g. phasing) - else if (a1.isNonReference()) { - final int[] idxVector = vc.getGLIndecesOfAlternateAllele(a1); - idxAA = idxVector[0]; - idxAB = idxVector[1]; - idxBB = idxVector[2]; - } - } - - refCount += normalizedLikelihoods[idxAA]; - hetCount += normalizedLikelihoods[idxAB]; - homCount += normalizedLikelihoods[idxBB]; + final double[] genotypeCounts = heterozygosityUtils.getGenotypeCountsForRefVsAllAlts(vc, genotypes); //guarantees that sampleCount is set + if (genotypeCounts.length != 3) { + throw new IllegalStateException("Input genotype counts must be length 3 for the number of genotypes with {2, 1, 0} ref alleles."); } + final double refCount = genotypeCounts[HeterozygosityUtils.REF_INDEX]; + final double hetCount = genotypeCounts[HeterozygosityUtils.HET_INDEX]; + final double homCount = genotypeCounts[HeterozygosityUtils.VAR_INDEX]; final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency final double q = 1.0 - p; // expected alternative allele frequency - final double F = 1.0 - ( hetCount / ( 2.0 * p * q * (double) sampleCount) ); // inbreeding coefficient + final double F = 1.0 - ( hetCount / ( 2.0 * p * q * (double) heterozygosityUtils.getSampleCount()) ); // inbreeding coefficient return F; } @@ -185,27 +175,13 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); if (genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) return null; - double F = calculateIC(vc, genotypes); - if (sampleCount < MIN_SAMPLES) + final double F = calculateIC(vc, genotypes); + if (heterozygosityUtils.getSampleCount() < MIN_SAMPLES) return null; return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.4f", F)); } - //this method is intended to reconcile uniquified sample names - // it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples because founderIds - // is derived from the sampleDB, which comes from the input sample names, but vc will have uniquified (i.e. different) - // sample names. Without this check, the founderIds won't be found in the vc and the annotation won't be calculated. - protected void checkSampleNames(final VariantContext vc) { - Set vcSamples = new HashSet<>(); - vcSamples.addAll(vc.getSampleNames()); - if (!vcSamples.isEmpty()) { - if (founderIds!=null) { - vcSamples.removeAll(founderIds); - if (vcSamples.equals(vc.getSampleNames())) - founderIds = vc.getSampleNames(); - } - } - } + @Override public List getKeyNames() { return Collections.singletonList(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java index b25631969..751b02d1b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/QualByDepth.java @@ -73,7 +73,7 @@ import htsjdk.variant.variantcontext.VariantContext; import java.util.*; /** - * Variant confidence normalized by unfiltered depth of variant samples + * Variant confidence normalized by depth of variant samples * *

This annotation puts the variant confidence QUAL score into perspective by normalizing for the amount of coverage available. Because each read contributes a little to the QUAL score, variants in regions with deep coverage can have artificially inflated QUAL scores, giving the impression that the call is supported by more evidence than it really is. To compensate for this, we normalize the variant confidence by depth, which gives us a more objective picture of how well supported the call is.

* @@ -86,7 +86,6 @@ import java.util.*; *
10	8046	.	C	T	4107.13	.	AC=1;AF=0.167;AN=6;BaseQRankSum=-3.717;DP=1063;FS=1.616;MLEAC=1;MLEAF=0.167;QD=11.54
    GT:AD:DP:GQ:PL:QSS	0/0:369,4:373:99:0,1007,12207:10548,98	    0/0:331,1:332:99:0,967,11125:9576,27	    0/1:192,164:356:99:4138,0,5291:5501,4505
*

QUAL/AD = 4107.13/356 = 11.54 = QD

- *

Note that currently, when HaplotypeCaller is run with `-ERC GVCF`, the QD calculation is invoked before AD itself has been calculated, due to a technical constraint. In that case, HaplotypeCaller uses the number of overlapping reads from the haplotype likelihood calculation in place of AD to calculate QD, which generally yields a very similar number. This does not cause any measurable problems, but can cause some confusion since the number may be slightly different than what you would expect to get if you did the calculation manually. For that reason, this behavior will be modified in an upcoming version.

* *

Caveat

*

This annotation can only be calculated for sites for which at least one sample was genotyped as carrying a variant allele.

@@ -100,6 +99,7 @@ import java.util.*; public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { // private final static Logger logger = Logger.getLogger(QualByDepth.class); + @Override public Map annotate(final RefMetaDataTracker tracker, final AnnotatorCompatible walker, final ReferenceContext ref, @@ -113,6 +113,25 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( genotypes == null || genotypes.size() == 0 ) return null; + final int standardDepth = getDepth(genotypes, stratifiedContexts, perReadAlleleLikelihoodMap); + + if ( standardDepth == 0 ) + return null; + + final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); + // Hack: UnifiedGenotyper (but not HaplotypeCaller or GenotypeGVCFs) over-estimates the quality of long indels + // Penalize the QD calculation for UG indels to compensate for this + double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, walker instanceof UnifiedGenotyper)); + + // Hack: see note in the fixTooHighQD method below + QD = fixTooHighQD(QD); + + final Map map = new HashMap<>(); + map.put(getKeyNames().get(0), String.format("%.2f", QD)); + return map; + } + + protected int getDepth(final GenotypesContext genotypes, final Map stratifiedContexts, final Map perReadAlleleLikelihoodMap) { int standardDepth = 0; int ADrestrictedDepth = 0; @@ -153,20 +172,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( ADrestrictedDepth > 0 ) standardDepth = ADrestrictedDepth; - if ( standardDepth == 0 ) - return null; - - final double altAlleleLength = GATKVariantContextUtils.getMeanAltAlleleLength(vc); - // Hack: UnifiedGenotyper (but not HaplotypeCaller or GenotypeGVCFs) over-estimates the quality of long indels - // Penalize the QD calculation for UG indels to compensate for this - double QD = -10.0 * vc.getLog10PError() / ((double)standardDepth * indelNormalizationFactor(altAlleleLength, walker instanceof UnifiedGenotyper)); - - // Hack: see note in the fixTooHighQD method below - QD = fixTooHighQD(QD); - - final Map map = new HashMap<>(); - map.put(getKeyNames().get(0), String.format("%.2f", QD)); - return map; + return standardDepth; } /** @@ -174,7 +180,7 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati * * @param altAlleleLength the average alternate allele length for the call * @param increaseNormalizationAsLengthIncreases should we apply a normalization factor based on the allele length? - * @return a possitive double + * @return a positive double */ private double indelNormalizationFactor(final double altAlleleLength, final boolean increaseNormalizationAsLengthIncreases) { return ( increaseNormalizationAsLengthIncreases ? Math.max(altAlleleLength / 3.0, 1.0) : 1.0); @@ -186,12 +192,10 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati * and VQSR will filter these out. This code looks at the QD value, and if it is above * threshold we map it down to the mean high QD value, with some jittering * - * // TODO -- remove me when HaplotypeCaller bubble caller is live - * * @param QD the raw QD score * @return a QD value */ - private double fixTooHighQD(final double QD) { + protected static double fixTooHighQD(final double QD) { if ( QD < MAX_QD_BEFORE_FIXING ) { return QD; } else { @@ -199,12 +203,14 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati } } - private final static double MAX_QD_BEFORE_FIXING = 35; - private final static double IDEAL_HIGH_QD = 30; - private final static double JITTER_SIGMA = 3; + protected final static double MAX_QD_BEFORE_FIXING = 35; + protected final static double IDEAL_HIGH_QD = 30; + protected final static double JITTER_SIGMA = 3; + @Override public List getKeyNames() { return Arrays.asList(GATKVCFConstants.QUAL_BY_DEPTH_KEY); } + @Override public List getDescriptions() { return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0))); } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/StandardHCAnnotation.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/StandardHCAnnotation.java new file mode 100644 index 000000000..65adb6989 --- /dev/null +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/interfaces/StandardHCAnnotation.java @@ -0,0 +1,57 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE +* SOFTWARE LICENSE AGREEMENT +* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. PHONE-HOME FEATURE +* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation. +* +* 4. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012-2015 Broad Institute, Inc. +* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 5. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 6. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 7. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 8. MISCELLANEOUS +* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.gatk.tools.walkers.annotator.interfaces; + +/** + * Annotations implementing this interface will be default for HaplotypeCaller + */ +public interface StandardHCAnnotation extends AnnotationType {} diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java index c3f44a1f0..657054953 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/GenotypingEngine.java @@ -112,10 +112,11 @@ public abstract class GenotypingEngine getAppropriateVCFInfoHeaders() { - Set headerInfo = new HashSet<>(); + final Set headerInfo = new HashSet<>(); if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY)); return headerInfo; @@ -191,7 +197,7 @@ public abstract class GenotypingEngine stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap) { + final Map perReadAlleleLikelihoodMap, + final boolean doAlleleSpecificCalcs) { final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; // if input VC can't be genotyped, exit with either null VCC or, in case where we need to emit all sites, an empty call @@ -227,7 +234,7 @@ public abstract class GenotypingEngine -0.0 which isn't nice - double log10Confidence = + final double log10Confidence = ! outputAlternativeAlleles.siteIsMonomorphic || configuration.genotypingOutputMode == GenotypingOutputMode.GENOTYPE_GIVEN_ALLELES || configuration.annotateAllSitesWithPLs ? AFresult.getLog10PosteriorOfAFEq0() + 0.0 @@ -241,7 +248,7 @@ public abstract class GenotypingEngine attributes = composeCallAttributes(inheritAttributesFromInputVC, vc, rawContext, stratifiedContexts, tracker, refContext, - outputAlternativeAlleles.alternativeAlleleMLECounts(), outputAlternativeAlleles.siteIsMonomorphic, AFresult, outputAlternativeAlleles.outputAlleles(vc.getReference()),genotypes,model,perReadAlleleLikelihoodMap); + outputAlternativeAlleles.alternativeAlleleMLECounts(), outputAlternativeAlleles.siteIsMonomorphic, AFresult, outputAlternativeAlleles.outputAlleles(vc.getReference()), genotypes, model, perReadAlleleLikelihoodMap, doAlleleSpecificCalcs); builder.attributes(attributes); @@ -337,8 +344,8 @@ public abstract class GenotypingEngine alleles = afcr.getAllelesUsedInGenotyping(); final int alternativeAlleleCount = alleles.size() - 1; - Allele[] outputAlleles = new Allele[alternativeAlleleCount]; - int[] mleCounts = new int[alternativeAlleleCount]; + final Allele[] outputAlleles = new Allele[alternativeAlleleCount]; + final int[] mleCounts = new int[alternativeAlleleCount]; int outputAlleleCount = 0; boolean siteIsMonomorphic = true; for (final Allele alternativeAllele : alleles) { @@ -615,7 +622,8 @@ public abstract class GenotypingEngine composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc, final AlignmentContext rawContext, final Map stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List alleleCountsofMLE, final boolean bestGuessIsRef, final AFCalculationResult AFresult, final List allAllelesToUse, final GenotypesContext genotypes, - final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { + final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap, + final boolean doAlleleSpecificCalcs) { final HashMap attributes = new HashMap<>(); final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; @@ -637,6 +645,22 @@ public abstract class GenotypingEngine perAlleleQuals = new ArrayList<>(); + //Per-allele quals are not calculated for biallelic sites + if (AFresult.getAllelesUsedInGenotyping().size() > 2) { + for (final Allele a : allAllelesToUse) { + if (a.isNonReference()) + perAlleleQuals.add(AFresult.getLog10PosteriorOfAFEq0ForAllele(a)); + } + } + else { + perAlleleQuals.add(AFresult.getLog10PosteriorOfAFEq0()); + } + + attributes.put(GATKVCFConstants.AS_QUAL_KEY, perAlleleQuals); + } return attributes; } diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java index 7abbd432e..b486d6b6c 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/genotyper/UnifiedGenotypingEngine.java @@ -96,6 +96,8 @@ public class UnifiedGenotypingEngine extends GenotypingEngine stratifiedContexts = getFilteredAndStratifiedContexts(refContext, rawContext, model); - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null, false); } /** @@ -305,7 +327,7 @@ public class UnifiedGenotypingEngine extends GenotypingEngine stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, inheritAttributesFromInputVC, perReadAlleleLikelihoodMap, false); } public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, @@ -340,8 +370,9 @@ public class UnifiedGenotypingEngine extends GenotypingEngine stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final Map perReadAlleleLikelihoodMap) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap); + final Map perReadAlleleLikelihoodMap, + final boolean useAlleleSpecificCalcs) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false, perReadAlleleLikelihoodMap, useAlleleSpecificCalcs); } @Override @@ -354,9 +385,10 @@ public class UnifiedGenotypingEngine extends GenotypingEngine stratifiedContexts, final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final boolean inheritAttributesFromInputVC, - final Map perReadAlleleLikelihoodMap) { + final Map perReadAlleleLikelihoodMap, + final boolean useAlleleSpecificCalcs) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; - final VariantCallContext result = super.calculateGenotypes(tracker,refContext,rawContext,stratifiedContexts,vc,model,inheritAttributesFromInputVC,perReadAlleleLikelihoodMap); + final VariantCallContext result = super.calculateGenotypes(tracker,refContext,rawContext,stratifiedContexts,vc,model,inheritAttributesFromInputVC,perReadAlleleLikelihoodMap, useAlleleSpecificCalcs); if ( verboseWriter != null && !limitedContext ) printVerboseData(refContext.getLocus().toString(), vc, model); return result; @@ -377,9 +409,10 @@ public class UnifiedGenotypingEngine extends GenotypingEngine composeCallAttributes(final boolean inheritAttributesFromInputVC, final VariantContext vc, final AlignmentContext rawContext, final Map stratifiedContexts, final RefMetaDataTracker tracker, final ReferenceContext refContext, final List alleleCountsofMLE, final boolean bestGuessIsRef, final AFCalculationResult AFresult, final List allAllelesToUse, final GenotypesContext genotypes, - final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { + final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap, + final boolean useAlleleSpecificCalcs) { final Map result = super.composeCallAttributes(inheritAttributesFromInputVC, vc,rawContext,stratifiedContexts,tracker,refContext,alleleCountsofMLE,bestGuessIsRef, - AFresult,allAllelesToUse,genotypes,model,perReadAlleleLikelihoodMap); + AFresult,allAllelesToUse,genotypes,model,perReadAlleleLikelihoodMap, useAlleleSpecificCalcs); final boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java index d37f1946e..25ff347ee 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java @@ -290,7 +290,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) - protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"ClippingRankSumTest", "DepthPerSampleHC"})); + protected List annotationsToUse = new ArrayList<>(); /** * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the @@ -302,7 +302,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In */ @Advanced @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) - protected List annotationsToExclude = new ArrayList<>(Arrays.asList(new String[]{})); + protected List annotationsToExclude = new ArrayList<>(); /** * Which groups of annotations to add to the output VCF file. The single value 'none' removes the default group. See @@ -311,7 +311,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In * to provide a pedigree file for a pedigree-based annotation) may cause the run to fail. */ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false) - protected String[] annotationGroupsToUse = { "Standard" }; + protected List annotationGroupsToUse = new ArrayList<>(Arrays.asList(new String[]{ "Standard", "StandardHCAnnotation" })); @ArgumentCollection private HaplotypeCallerArgumentCollection HCAC = new HaplotypeCallerArgumentCollection(); @@ -623,7 +623,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In genotypingEngine = new HaplotypeCallerGenotypingEngine(HCAC, samplesList, genomeLocParser, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), HCAC,logger), !doNotRunPhysicalPhasing); // initialize the output VCF header - final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationGroupsToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); + final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit()); final Set headerInfo = new HashSet<>(); diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java index 658b014aa..3bff97484 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFs.java @@ -159,7 +159,7 @@ public class GenotypeGVCFs extends RodWalker annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts", "StrandOddsRatio"})); + protected List annotationsToUse = new ArrayList<>(); /** * Which groups of annotations to add to the output VCF file. The single value 'none' removes the default group. See @@ -168,7 +168,7 @@ public class GenotypeGVCFs extends RodWalker annotationGroupsToUse = new ArrayList<>(Arrays.asList(new String[]{"Standard"})); /** @@ -217,11 +217,13 @@ public class GenotypeGVCFs extends RodWalkeremptyList(), this, toolkit); + annotationEngine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, Collections.emptyList(), this, toolkit); + + // create the genotyping engine + boolean doAlleleSpecificGenotyping = annotationsToUse.contains(GATKVCFConstants.AS_QUAL_BY_DEPTH_KEY) || annotationGroupsToUse.contains("AS_Standard"); + genotypingEngine = new UnifiedGenotypingEngine(createUAC(), samples, toolkit.getGenomeLocParser(), GeneralPloidyFailOverAFCalculatorProvider.createThreadSafeProvider(toolkit, genotypeArgs, logger), + toolkit.getArguments().BAQMode, doAlleleSpecificGenotyping); // take care of the VCF headers final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); @@ -341,6 +343,8 @@ public class GenotypeGVCFs extends RodWalker 0 ) { + Assert.assertNotNull(gt.getPL()); + Assert.assertTrue(gt.getPL().length > 0); + for ( int i : gt.getPL() ) { + Assert.assertTrue(i >= 0); + } + Assert.assertNotEquals(Arrays.toString(gt.getPL()),"[0]"); + } + return gt; + } + + private Genotype makeG(String sample, Allele a1, Allele a2, int... pls) { + return new GenotypeBuilder(sample, Arrays.asList(a1, a2)).PL(pls).make(); + } + + private VariantContext makeVC(String source, List alleles, Genotype... genotypes) { + int start = 10; + int stop = start; // alleles.contains(ATC) ? start + 3 : start; + return new VariantContextBuilder(source, "1", start, stop, alleles) + .genotypes(Arrays.asList(genotypes)) + .filters((String)null) + .make(); + } + + @Test + public void testInbreedingCoeffForMultiallelicVC() { + //make sure that compound gets (with no ref) don't add to het count + VariantContext test1 = makeVC("1",Arrays.asList(Aref,T,C), + makeG("s1", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s2", T, T, 7099, 2530, 0, 7099, 366, 3056), + makeG("s3", T, C, 7099, 2530, 7099, 3056, 0, 14931), + makeG("s4", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s5", T, T, 7099, 2530, 0, 7099, 366, 3056), + makeG("s6", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s7", T, T, 7099, 2530, 0, 7099, 366, 3056), + makeG("s8", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s9", T, T, 7099, 2530, 0, 7099, 366, 3056), + makeG("s10", Aref, T, 2530, 0, 7099, 366, 3056, 14931)); + + final AS_InbreedingCoeff testClass1 = new AS_InbreedingCoeff(); + testClass1.initialize(null, null, null); + final double ICresult1 = testClass1.calculateIC(test1, T); + Assert.assertEquals(ICresult1, -0.4285714, DELTA_PRECISION, "Pass"); + final double ICresult1b = testClass1.calculateIC(test1, C); + Assert.assertEquals(ICresult1b, -0.05263, DELTA_PRECISION, "Pass"); + + //make sure that hets with different alternate alleles all get counted + VariantContext test2 = makeVC("2", Arrays.asList(Aref,T,C), + makeG("s1", Aref, C, 4878, 1623, 11297, 0, 7970, 8847), + makeG("s2", Aref, T, 2530, 0, 7099, 366, 3056, 14931), + makeG("s3", Aref, T, 3382, 0, 6364, 1817, 5867, 12246), + makeG("s4", Aref, T, 2488, 0, 9110, 3131, 9374, 12505), + makeG("s5", Aref, C, 4530, 2006, 18875, 0, 6847, 23949), + makeG("s6", Aref, T, 5325, 0, 18692, 389, 16014, 24570), + makeG("s7", Aref, T, 2936, 0, 29743, 499, 21979, 38630), + makeG("s8", Aref, T, 6902, 0, 8976, 45, 5844, 9061), + makeG("s9", Aref, T, 5732, 0, 10876, 6394, 11408, 17802), + makeG("s10", Aref, T, 2780, 0, 25045, 824, 23330, 30939)); + + final AS_InbreedingCoeff testClass2 = new AS_InbreedingCoeff(); + testClass2.initialize(null, null, null); + final double ICresult2 = testClass2.calculateIC(test2, T); + Assert.assertEquals(ICresult2, -0.666666, DELTA_PRECISION, "Pass"); + final double ICresult2b = testClass2.calculateIC(test2, C); + Assert.assertEquals(ICresult2b, -0.111129, DELTA_PRECISION, "Pass"); + } + + @Test + public void testSingletonVsCommonAllele() { + + final List allGTs = new ArrayList<>(); + final int numHomRefGTs = 10000; + allGTs.add(makeG("het0", Aref, T, hetPLs)); + + for ( int i = 0; i < numHomRefGTs; i++ ) + allGTs.add(makeG("ref" + i, Aref, Aref, homRefPLs)); + + int numHetGTs = 1; + + final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + AS_InbreedingCoeff testClass = new AS_InbreedingCoeff(); + testClass.initialize(null,null,null); + final double ICsingleton = testClass.calculateIC(singleton, T); + + final int targetNumHetGTs = 20; + for ( int i = numHetGTs; i < targetNumHetGTs; i++ ) + allGTs.add(makeG("het" + i, Aref, T, hetPLs)); + + final VariantContext common = makeVC("common", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + testClass = new AS_InbreedingCoeff(); + testClass.initialize(null,null,null); + final double ICcommon = testClass.calculateIC(common, T); + + Assert.assertTrue(Math.abs(ICsingleton) < Math.abs(ICcommon), String.format("singleton=%f common=%f", ICsingleton, ICcommon)); + } + + @Test + public void testLargeCohorts() { + + final List allGTs = new ArrayList<>(); + final int numHomRefGTs = 1000000; + for ( int i = 0; i < numHomRefGTs; i++ ) + allGTs.add(makeG("ref" + i, Aref, Aref, homRefPLs)); + + allGTs.add(makeG("het0", Aref, T, hetPLs)); + int numHetGTs = 1; + + final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + AS_InbreedingCoeff testClass = new AS_InbreedingCoeff(); + testClass.initialize(null,null,null); + final double ICsingleton = testClass.calculateIC(singleton, T); + + for ( int i = numHetGTs; i < 100; i++ ) { + allGTs.add(makeG("het" + i, Aref, T, hetPLs)); + numHetGTs++; + } + + final VariantContext hundredton = makeVC("hundredton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + testClass = new AS_InbreedingCoeff(); + testClass.initialize(null,null,null); + final double IChundredton = testClass.calculateIC(hundredton, T); + + Assert.assertTrue(Math.abs(ICsingleton) < Math.abs(IChundredton), String.format("singleton=%f hundredton=%f", ICsingleton, IChundredton)); + + for ( int i = numHetGTs; i < numHomRefGTs; i++ ) + allGTs.add(makeG("het" + i, Aref, T, hetPLs)); + + final VariantContext common = makeVC("common", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); + testClass = new AS_InbreedingCoeff(); + testClass.initialize(null,null,null); + final double ICcommon = testClass.calculateIC(common, T); + + Assert.assertTrue(Math.abs(IChundredton) < Math.abs(ICcommon), String.format("hundredton=%f common=%f", IChundredton, ICcommon)); + } + + @Test + public void testAllHetsForLargeCohorts() { + + final int numGTs = 1000000; + + final List singletonGTs = new ArrayList<>(); + for ( int i = 0; i < numGTs; i++ ) + singletonGTs.add(makeG("ref" + i, Aref, Aref, homRefPLs)); + + singletonGTs.add(makeG("het0", Aref, T, hetPLs)); + + final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), singletonGTs.toArray(new Genotype[singletonGTs.size()])); + AS_InbreedingCoeff testClass = new AS_InbreedingCoeff(); + testClass.initialize(null, null, null); + final double ICsingleton = testClass.calculateIC(singleton, T); + + final List allHetGTs = new ArrayList<>(); + for ( int i = 0; i < numGTs; i++ ) + allHetGTs.add(makeG("het" + i, Aref, T, hetPLs)); + + final VariantContext allHet = makeVC("allHet", Arrays.asList(Aref, T), allHetGTs.toArray(new Genotype[allHetGTs.size()])); + testClass.initialize(null, null, null); + final double ICHets = testClass.calculateIC(allHet, T); + + Assert.assertTrue(Math.abs(ICsingleton) < Math.abs(ICHets), String.format("singleton=%f allHets=%f", ICsingleton, ICHets)); + } +} + diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java index 563a20a6b..8b5194485 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/ExcessHetUnitTest.java @@ -242,7 +242,7 @@ public class ExcessHetUnitTest { @Test(dataProvider = "smallSets") public void smallSets(int hetCount, int homrefCount, int homvarCount, double expected) { - double actual = new ExcessHet().exactTest(hetCount, homrefCount, homvarCount); + double actual = new ExcessHet().exactTest(new int[]{homrefCount, hetCount, homvarCount}); Assert.assertEquals(actual, expected, DELTA_PRECISION, "Pass"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeffUnitTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeffUnitTest.java index a9f778aa4..f23079bfe 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeffUnitTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/annotator/InbreedingCoeffUnitTest.java @@ -124,7 +124,10 @@ public class InbreedingCoeffUnitTest { makeG("s9", T, T, 7099, 2530, 0, 7099, 366, 3056), makeG("s10", Aref, T, 2530, 0, 7099, 366, 3056, 14931)); - final double ICresult1 = new InbreedingCoeff().calculateIC(test1, test1.getGenotypes()); + InbreedingCoeff testClass = new InbreedingCoeff(); + //Since we're calling this from outside the AnnotationEngine, the InbreedingCoeff has to be initialized so it can store genotype counts + testClass.initialize(null, null, null); + final double ICresult1 = testClass.calculateIC(test1, test1.getGenotypes()); Assert.assertEquals(ICresult1, -0.3333333, DELTA_PRECISION, "Pass"); //make sure that hets with different alternate alleles all get counted @@ -140,7 +143,8 @@ public class InbreedingCoeffUnitTest { makeG("s9", Aref, T, 5732, 0, 10876, 6394, 11408, 17802), makeG("s10", Aref, T, 2780, 0, 25045, 824, 23330, 30939)); - final double ICresult2 = new InbreedingCoeff().calculateIC(test2, test2.getGenotypes()); + testClass.initialize(null, null, null); + final double ICresult2 = testClass.calculateIC(test2, test2.getGenotypes()); Assert.assertEquals(ICresult2, -1.0, DELTA_PRECISION, "Pass"); } @@ -156,14 +160,17 @@ public class InbreedingCoeffUnitTest { int numHetGTs = 1; final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); - final double ICsingleton = new InbreedingCoeff().calculateIC(singleton, singleton.getGenotypes()); + InbreedingCoeff testClass = new InbreedingCoeff(); + testClass.initialize(null, null, null); + final double ICsingleton = testClass.calculateIC(singleton, singleton.getGenotypes()); final int targetNumHetGTs = 20; for ( int i = numHetGTs; i < targetNumHetGTs; i++ ) allGTs.add(makeG("het" + i, Aref, T, hetPLs)); final VariantContext common = makeVC("common", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); - final double ICcommon = new InbreedingCoeff().calculateIC(common, common.getGenotypes()); + testClass.initialize(null, null, null); + final double ICcommon = testClass.calculateIC(common, common.getGenotypes()); Assert.assertTrue(Math.abs(ICsingleton) < Math.abs(ICcommon), String.format("singleton=%f common=%f", ICsingleton, ICcommon)); } @@ -180,7 +187,9 @@ public class InbreedingCoeffUnitTest { int numHetGTs = 1; final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); - final double ICsingleton = new InbreedingCoeff().calculateIC(singleton, singleton.getGenotypes()); + InbreedingCoeff testClass = new InbreedingCoeff(); + testClass.initialize(null, null, null); + final double ICsingleton = testClass.calculateIC(singleton, singleton.getGenotypes()); for ( int i = numHetGTs; i < 100; i++ ) { allGTs.add(makeG("het" + i, Aref, T, hetPLs)); @@ -188,7 +197,8 @@ public class InbreedingCoeffUnitTest { } final VariantContext hundredton = makeVC("hundredton", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); - final double IChundredton = new InbreedingCoeff().calculateIC(hundredton, hundredton.getGenotypes()); + testClass.initialize(null, null, null); + final double IChundredton = testClass.calculateIC(hundredton, hundredton.getGenotypes()); Assert.assertTrue(Math.abs(ICsingleton) < Math.abs(IChundredton), String.format("singleton=%f hundredton=%f", ICsingleton, IChundredton)); @@ -196,7 +206,8 @@ public class InbreedingCoeffUnitTest { allGTs.add(makeG("het" + i, Aref, T, hetPLs)); final VariantContext common = makeVC("common", Arrays.asList(Aref, T), allGTs.toArray(new Genotype[allGTs.size()])); - final double ICcommon = new InbreedingCoeff().calculateIC(common, common.getGenotypes()); + testClass.initialize(null, null, null); + final double ICcommon = testClass.calculateIC(common, common.getGenotypes()); Assert.assertTrue(Math.abs(IChundredton) < Math.abs(ICcommon), String.format("hundredton=%f common=%f", IChundredton, ICcommon)); } @@ -213,14 +224,17 @@ public class InbreedingCoeffUnitTest { singletonGTs.add(makeG("het0", Aref, T, hetPLs)); final VariantContext singleton = makeVC("singleton", Arrays.asList(Aref, T), singletonGTs.toArray(new Genotype[singletonGTs.size()])); - final double ICsingleton = new InbreedingCoeff().calculateIC(singleton, singleton.getGenotypes()); + InbreedingCoeff testClass = new InbreedingCoeff(); + testClass.initialize(null, null, null); + final double ICsingleton = testClass.calculateIC(singleton, singleton.getGenotypes()); final List allHetGTs = new ArrayList<>(); for ( int i = 0; i < numGTs; i++ ) allHetGTs.add(makeG("het" + i, Aref, T, hetPLs)); final VariantContext allHet = makeVC("allHet", Arrays.asList(Aref, T), allHetGTs.toArray(new Genotype[allHetGTs.size()])); - final double ICHets = new InbreedingCoeff().calculateIC(allHet, allHet.getGenotypes()); + testClass.initialize(null, null, null); + final double ICHets = testClass.calculateIC(allHet, allHet.getGenotypes()); Assert.assertTrue(Math.abs(ICsingleton) < Math.abs(ICHets), String.format("singleton=%f allHets=%f", ICsingleton, ICHets)); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java index f0b36b9ba..f14fda293 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest.java @@ -72,7 +72,7 @@ public class HaplotypeCallerComplexAndSymbolicVariantsIntegrationTest extends Wa @Test public void testHaplotypeCallerMultiSampleComplex1() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "c652334e4b9e422d3a96f2f4d221a163"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "066f1ce9e9826bcfedf6cd80bc560ab8"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java index ba7676654..20b089630 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGVCFIntegrationTest.java @@ -320,7 +320,7 @@ public class HaplotypeCallerGVCFIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String commandLine = String.format("-T HaplotypeCaller --pcr_indel_model NONE -pairHMMSub %s %s -R %s -I %s -L %s -ERC GVCF --no_cmdline_in_header -variant_index_type %s -variant_index_parameter %d -G Standard -G AS_Standard --disableDithering", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReference, privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", "20:10433000-10437000", GATKVCFUtils.DEFAULT_GVCF_INDEX_TYPE, GATKVCFUtils.DEFAULT_GVCF_INDEX_PARAMETER); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("fc10e56ffa5e230c1a1c8bdf9673bdaa")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("716db262a288eb2a477df3f1957372c7")); spec.disableShadowBCF(); executeTest(" testAlleleSpecificAnnotations", spec); } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 418bdf778..4ac07e377 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -373,7 +373,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testLackSensitivityDueToBadHaplotypeSelectionFix() { final String commandLine = String.format("-T HaplotypeCaller -pairHMMSub %s %s -R %s -I %s -L %s --no_cmdline_in_header --maxNumHaplotypesInPopulation 16", HMM_SUB_IMPLEMENTATION, ALWAYS_LOAD_VECTOR_HMM, b37KGReferenceWithDecoy, privateTestDir + "hc-lack-sensitivity.bam", privateTestDir + "hc-lack-sensitivity.interval_list"); - final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("9dbc1f3e79362f36ddd6c2830791a2bf")); + final WalkerTestSpec spec = new WalkerTestSpec(commandLine + " -o %s", Arrays.asList("02ec3090c4a6359fa10e6d8b30e1d5a2")); spec.disableShadowBCF(); executeTest("testLackSensitivityDueToBadHaplotypeSelectionFix", spec); } @@ -459,12 +459,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerTandemRepeatAnnotator() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "2c108ecdc73e1158310dcdab1abf3e66"); + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A TandemRepeatAnnotator -XA MappingQualityZero -XA SpanningDeletions", "03738462e7f0b6f149f40b790a3a7261"); } @Test public void testHBaseCountsBySample() throws IOException{ - HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "f55b64e7457ed40dc8d56c499de9d516"); + HCTest(NA12878_BAM, " -L 20:10001000-10010000 -A BaseCountsBySample", "8739d92898113436666f1cff3bc39bc5"); } } diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java index 1909060a0..7650ff1ce 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/variantutils/GenotypeGVCFsIntegrationTest.java @@ -84,7 +84,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf", b37KGReference), 1, - Arrays.asList("9424ae7832fd53790372578423392468")); + Arrays.asList("beebc536d20d69a45c6f56fbb041c9bc")); executeTest("testUpdatePGT", spec); } @@ -94,7 +94,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + privateTestDir + "testUpdatePGT.vcf -A StrandAlleleCountsBySample -log " + logFileName, b37KGReference), 1, - Arrays.asList("5deed67f8eb10cbd4429d70e0c26ef7c")); + Arrays.asList("527d513874a787821daf54b8fc8a33e3")); executeTest("testUpdatePGTStrandAlleleCountsBySample", spec); File file = new File(logFileName); @@ -109,7 +109,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("6df12487566b55c4cdbc0993ddf4a75e")); + Arrays.asList("63bdb33fe44b6589adc5c36b0ea740b2")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -121,7 +121,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "tetraploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("e0a51868271fb7b399478b9c63ede291")); + Arrays.asList("3708b0d993a683e8c7421f60d7123cf4")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -133,7 +133,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "diploid-gvcf-3.vcf" + " -L " + privateTestDir + "tetraploid-gvcfs.intervals", b37KGReference), 1, - Arrays.asList("1e4be55f727c7a9376a7a37406f06cee")); + Arrays.asList("7d7a65ea549fcd30553766ad4333f9e2")); executeTest("combineSingleSamplePipelineGVCF", spec); } @@ -145,7 +145,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " --includeNonVariantSites -L 20:10,030,000-10,033,000 -L 20:10,386,000-10,386,500", b37KGReference), 1, - Arrays.asList("13c24e54c17e667a9647066aed54da76")); + Arrays.asList("8b338d065806f7c7eea67f56a1f6009e")); executeTest("combineSingleSamplePipelineGVCF_includeNonVariants", spec); } @@ -158,7 +158,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-20,000,000", b37KGReference), 1, - Arrays.asList("95e21516ef7a71de9b299d502e83268e")); + Arrays.asList("2d4e6a3193c493514576a758e891b951")); executeTest("combineSingleSamplePipelineGVCFHierarchical", spec); } @@ -170,7 +170,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + " -L 20:10,000,000-11,000,000 --dbsnp " + b37dbSNP132, b37KGReference), 1, - Arrays.asList("f352d0204a37a7adc4000318d5f24cb8")); + Arrays.asList("7693207e925359df331e64664c5b8763")); executeTest("combineSingleSamplePipelineGVCF_addDbsnp", spec); } @@ -180,7 +180,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -L 1:69485-69791 -o %s -R " + b37KGReference + " -V " + privateTestDir + "gvcfExample1.vcf", 1, - Arrays.asList("88ef27a16fc44a3caa700677d5b78df5")); + Arrays.asList("84ad9c6e7582dbcc693deacdeff5984a")); executeTest("testJustOneSample", spec); } @@ -191,14 +191,14 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V " + privateTestDir + "gvcfExample1.vcf" + " -V " + privateTestDir + "gvcfExample2.vcf", 1, - Arrays.asList("45d9c291a74897cf3c3aaf52af5a08f8")); + Arrays.asList("54b76f721811c9c7958e849c40b8d4e2")); executeTest("testSamplesWithDifferentLs", spec); } @Test(enabled = true) public void testNoPLsException() { // Test with input files with (1) 0/0 and (2) ./. - final String md5 = "2dd285987047f37c730060734ca76038"; + final String md5 = "276159213ddaaf82cd0e640cc7a77fc4"; WalkerTestSpec spec1 = new WalkerTestSpec( "-T GenotypeGVCFs --no_cmdline_in_header -L 1:1115550-1115551 -o %s -R " + hg19Reference + " --variant " + privateTestDir + "combined_genotype_gvcf_exception.vcf", @@ -218,7 +218,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-nda"), 1, - Arrays.asList("b7fda2981a2367764b83df7aa3b3b3e7")); + Arrays.asList("3c9c84b78e7d3b358c8cb7e29a2d302b")); executeTest("testNDA", spec); } @@ -227,7 +227,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-maxAltAlleles 1"), 1, - Arrays.asList("0c26cabfcebfb0b662185b233e0cd976")); + Arrays.asList("87ed70b8f910b662aa67e8ed1b2ed174")); executeTest("testMaxAltAlleles", spec); } @@ -236,7 +236,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseBPResolutionString("-stand_call_conf 300 -stand_emit_conf 100"), 1, - Arrays.asList("102c33e8b2a7200972d58fc6bae8bd3b")); + Arrays.asList("1d98fb542a39090db3a8f89ae232e1e5")); executeTest("testStandardConf", spec); } @@ -259,7 +259,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { final WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -V " + gVCF.getAbsolutePath(), b37KGReference), 1, - Arrays.asList("81ae1bc09f9cd34b0b37ae3cb4b09bc8")); + Arrays.asList("eeff965cd79c0b7085c7d4d7ecf82b68")); spec.disableShadowBCF(); //TODO: Remove when BaseTest.assertAttributesEquals() works with SAC executeTest("testStrandAlleleCountsBySample", spec); } @@ -276,7 +276,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V:combined2 " + privateTestDir + "combine.single.sample.pipeline.combined.vcf" + " --uniquifySamples", b37KGReference), 1, - Arrays.asList("1f6321464371ecffb06108fb86ffab49")); + Arrays.asList("b73f5bf5646695ca019d84d44c74c819")); executeTest("testUniquifiedSamples", spec); } @@ -448,7 +448,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { } - private static final String simpleSpanningDeletionsMD5 = "9ccf64721dba002bef08046f02eb5ee9"; + private static final String simpleSpanningDeletionsMD5 = "85c14341171548997e4503f7b5a9253f"; @Test(enabled = true) public void testSpanningDeletionsMD5() { @@ -478,7 +478,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.1.g.vcf -V " + privateTestDir + "spanningDel.2.g.vcf -V " + privateTestDir + "spanningDel.3.g.vcf", 1, - Arrays.asList("80de258cf5c73e0805e15347ef82d4be")); + Arrays.asList("6c5761ffb7a0c5252f3f5048d52f500e")); spec.disableShadowBCF(); executeTest("testMultipleSpanningDeletionsMD5", spec); } @@ -489,7 +489,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.delOnly.g.vcf", 1, - Arrays.asList("b16ae4b8e87bc87cd689929c28aff3de")); + Arrays.asList("c8414446dbac9a3639bfc2f347cc2c1d")); spec.disableShadowBCF(); executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); } @@ -500,7 +500,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "spanningDel.depr.delOnly.g.vcf", 1, - Arrays.asList("586e814c0578e245352cbb4647b5a46a")); + Arrays.asList("d1d8c3db65905b4ef79f960f9565ca94")); spec.disableShadowBCF(); executeTest("testSpanningDeletionDoesNotGetGenotypedWithNoOtherAlleles", spec); } @@ -523,7 +523,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "ad-bug-input.vcf", 1, - Arrays.asList("027f96584e91ca8255764fbf38293963")); + Arrays.asList("a8dcb9024e3701449ec2a1fe75e0d057")); spec.disableShadowBCF(); executeTest("testBadADPropagationHaploidBugTest", spec); } @@ -534,7 +534,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "261_S01_raw_variants_gvcf.vcf", 1, - Arrays.asList("09da4a0fb937efab228413d1162fde2d")); + Arrays.asList("01a9eee63801d46de8fcf1d6f80f8359")); spec.disableShadowBCF(); executeTest("testSAC", spec); } @@ -545,7 +545,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { "-T GenotypeGVCFs --no_cmdline_in_header -o %s -R " + b37KGReference + " -V " + privateTestDir + "tetraploid-multisample-sac.g.vcf", 1, - Arrays.asList("a8b7b300d6d5345b7b02b86d75671756")); + Arrays.asList("8c79a16f6a524d49ff402b8c0b39b396")); spec.disableShadowBCF(); executeTest("testSACMultisampleTetraploid", spec); } @@ -557,7 +557,7 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { " -V " + privateTestDir + "set.zero.RGQs.no.call.sample2.g.vcf" + " -L chr16:1279274-1279874 -allSites", hg19ReferenceWithChrPrefixInChromosomeNames), 1, - Arrays.asList("d617884b08ee85816f1ba1acf11f1738")); + Arrays.asList("75f6402da0f6b8b4e69c847fe8b5179a")); executeTest("testSetZeroRGQsToNoCall", spec); } @@ -565,18 +565,30 @@ public class GenotypeGVCFsIntegrationTest extends WalkerTest { public void testAlleleSpecificAnnotations() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf -V " + privateTestDir + "NA12891.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("f62ae282ec8e32f6104ef84237b8a5a4")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("35daaea8dea591d35ca99854c8d36e5f")); spec.disableShadowBCF(); executeTest("testAlleleSpecificAnnotations", spec); } @Test //make sure none of the assumptions about things being merged as lists break the single-sample case + //This test file also doesn't have raw data, so test to make sure that doesn't make GenotypeGVCFs crash and burn + //Note that AS_InbreedingCoeff and InbreedingCoeff may still differ for bialleleic sites for low number of samples because allele frequencies are derived differently public void testAlleleSpecificAnnotations_oneSample() { final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + privateTestDir + "NA12878.AS.chr20snippet.g.vcf"; - final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("1eefb0ed407b9071f09f9189c9ad45cf")); + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("b6026b2a2d2da39f181a4905b2225dad")); spec.disableShadowBCF(); - executeTest("testAlleleSpecificAnnotations", spec); + executeTest("testAlleleSpecificAnnotations_oneSample", spec); + } + + @Test + //do at least 10 samples so InbreedingCoeff and AS_InbreedingCoeff are output + public void testAlleleSpecificAnnotations_elevenSamples() { + final String cmd = "-T GenotypeGVCFs -R " + b37KGReference + " -o %s --no_cmdline_in_header -G Standard -G AS_Standard --disableDithering -V " + + privateTestDir + "multiSamples.g.vcf"; + final WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, Arrays.asList("84b5723c9c8eeb5549aaceb4fd4053b5")); + spec.disableShadowBCF(); + executeTest("testAlleleSpecificAnnotations_elevenSamples", spec); } } \ No newline at end of file diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java index 840ecd808..4af58114a 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -126,6 +126,8 @@ public class VariantAnnotatorEngine { protected List getRequestedExpressions() { return requestedExpressions; } + public List getRequestedReducibleInfoAnnotations() { return Collections.unmodifiableList(requestedReducibleInfoAnnotations); } + private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude) { AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse); requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse); @@ -349,10 +351,7 @@ public class VariantAnnotatorEngine { // go through all the requested info annotationTypes for ( final InfoFieldAnnotation annotationType : requestedReducibleInfoAnnotations ) { - if ( !(annotationType instanceof ActiveRegionBasedAnnotation) ) - continue; - - ReducibleAnnotation currentASannotation = (ReducibleAnnotation)annotationType; + ReducibleAnnotation currentASannotation = (ReducibleAnnotation)annotationType; final Map annotationsFromCurrentType = currentASannotation.finalizeRawData(vc, originalVC); if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java index f871bdc1b..4947d6195 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFConstants.java @@ -73,6 +73,7 @@ public final class GATKVCFConstants { public static final String AVG_INTERVAL_DP_KEY = "IDP"; //DiagnoseTargets public static final String INTERVAL_GC_CONTENT_KEY = "IGC"; public static final String INBREEDING_COEFFICIENT_KEY = "InbreedingCoeff"; + public static final String AS_INBREEDING_COEFFICIENT_KEY = "AS_InbreedingCoeff"; public static final String EXCESS_HET_KEY = "ExcessHet"; public static final String AS_HETEROZYGOSITY_KEY = "AS_InbreedingCoeff"; public static final String LIKELIHOOD_RANK_SUM_KEY = "LikelihoodRankSum"; @@ -102,6 +103,8 @@ public final class GATKVCFConstants { public static final String PANEL_OF_NORMALS_COUNT_KEY = "PON"; //M2 public static final String POSITIVE_LABEL_KEY = "POSITIVE_TRAIN_SITE"; public static final String QUAL_BY_DEPTH_KEY = "QD"; + public static final String AS_QUAL_BY_DEPTH_KEY = "AS_QD"; + public static final String AS_QUAL_KEY = "AS_QUAL"; public static final String BEAGLE_R2_KEY = "R2"; //BeagleOutputToVCF public static final String AS_READ_POS_RANK_SUM_KEY = "AS_ReadPosRankSum"; public static final String AS_RAW_READ_POS_RANK_SUM_KEY = "AS_RAW_ReadPosRankSum"; diff --git a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java index 634da68f8..70b7108c8 100644 --- a/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java +++ b/public/gatk-utils/src/main/java/org/broadinstitute/gatk/utils/variant/GATKVCFHeaderLines.java @@ -129,6 +129,7 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(HARDY_WEINBERG_KEY, 1, VCFHeaderLineType.Float, "Phred-scaled p-value for Hardy-Weinberg violation")); addInfoLine(new VCFInfoHeaderLine(HOMOPOLYMER_RUN_KEY, 1, VCFHeaderLineType.Integer, "Largest Contiguous Homopolymer Run of Variant Allele In Either Direction")); addInfoLine(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); + addInfoLine(new VCFInfoHeaderLine(AS_INBREEDING_COEFFICIENT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation")); addInfoLine(new VCFInfoHeaderLine(EXCESS_HET_KEY, 1, VCFHeaderLineType.Float, "Phred-scaled p-value for exact test of excess heterozygosity")); addInfoLine(new VCFInfoHeaderLine(AS_HETEROZYGOSITY_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific heterozygosity as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation; relate to inbreeding coefficient")); addInfoLine(new VCFInfoHeaderLine(LIKELIHOOD_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods")); @@ -145,6 +146,8 @@ public class GATKVCFHeaderLines { addInfoLine(new VCFInfoHeaderLine(HI_CONF_DENOVO_KEY, 1, VCFHeaderLineType.String, "High confidence possible de novo mutation (GQ >= 20 for all trio members)=[comma-delimited list of child samples]")); addInfoLine(new VCFInfoHeaderLine(LO_CONF_DENOVO_KEY, 1, VCFHeaderLineType.String, "Low confidence possible de novo mutation (GQ >= 10 for child, GQ > 0 for parents)=[comma-delimited list of child samples]")); addInfoLine(new VCFInfoHeaderLine(QUAL_BY_DEPTH_KEY, 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); + addInfoLine(new VCFInfoHeaderLine(AS_QUAL_BY_DEPTH_KEY, 1, VCFHeaderLineType.Float, "Allele-specific Variant Confidence/Quality by Depth")); + addInfoLine(new VCFInfoHeaderLine(AS_QUAL_KEY, 1, VCFHeaderLineType.Float, "Allele-specific Variant Qual Score")); addInfoLine(new VCFInfoHeaderLine(READ_POS_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias")); addInfoLine(new VCFInfoHeaderLine(AS_READ_POS_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific Z-score from Wilcoxon rank sum test of each Alt vs. Ref read position bias")); addInfoLine(new VCFInfoHeaderLine(AS_RAW_READ_POS_RANK_SUM_KEY, 1, VCFHeaderLineType.String, "allele specific raw data for rank sum test of read position bias"));