diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java
new file mode 100644
index 000000000..10439d918
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_BaseQualityRankSumTest.java
@@ -0,0 +1,99 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.sam.ReadUtils;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.Arrays;
+import java.util.List;
+
+
+/**
+ * Allele-specific rank Sum Test of REF versus each ALT base quality scores
+ *
+ *
This variant-level annotation tests compares the base qualities of the data supporting the reference allele with those supporting each alternate allele. The ideal result is a value close to zero, which indicates there is little to no difference. A negative value indicates that the bases supporting the alternate allele have lower quality scores than those supporting the reference allele. Conversely, a positive value indicates that the bases supporting the alternate allele have higher quality scores than those supporting the reference allele. Finding a statistically significant difference either way suggests that the sequencing process may have been biased or affected by an artifact.
+ *
+ *
Statistical notes
+ *
The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for base qualities (bases supporting REF vs. bases supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ *
+ *
Caveat
+ *
Uninformative reads are not used in these calculations.
+ *
+ */
+public class AS_BaseQualityRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation {
+ @Override
+ public List getKeyNames() {
+ return Arrays.asList(GATKVCFConstants.AS_BASE_QUAL_RANK_SUM_KEY);
+ }
+
+ @Override
+ public String getRawKeyName() { return GATKVCFConstants.AS_RAW_BASE_QUAL_RANK_SUM_KEY;}
+
+ /**
+ * Get the element for the given read at the given reference position
+ *
+ * @param read the read
+ * @param refLoc the reference position
+ * @return a Double representing the element to be used in the rank sum test, or null if it should not be used
+ */
+ @Override
+ protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
+ return (double) read.getBaseQualities()[ReadUtils.getReadCoordinateForReferenceCoordinateUpToEndOfRead(read, refLoc, ReadUtils.ClippingTail.RIGHT_TAIL)];
+ }
+
+}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java
new file mode 100644
index 000000000..7a8479594
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_FisherStrand.java
@@ -0,0 +1,134 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Allele specific strand bias estimated using Fisher's Exact Test
+ *
+ */
+public class AS_FisherStrand extends AS_StrandBiasTest implements AS_StandardAnnotation {
+
+ @Override
+ public List getKeyNames() {
+ return Collections.singletonList(GATKVCFConstants.AS_FISHER_STRAND_KEY);
+ }
+
+ @Override
+ protected Map calculateAnnotationFromLikelihoodMap(final Map stratifiedPerReadAlleleLikelihoodMap,
+ final VariantContext vc) {
+ // either SNP with no alignment context, or indels: per-read likelihood map needed
+ final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc, MIN_COUNT);
+ //logger.info("VC " + vc);
+ //printTable(table, 0.0);
+ return pValueAnnotationForBestTable(table, null);
+ }
+
+ /**
+ * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
+ *
+ * @param table1 a contingency table, may be null
+ * @param table2 a contingency table, may be null
+ * @return annotation result for FS given tables
+ */
+ private Map pValueAnnotationForBestTable(final int[][] table1, final int[][] table2) {
+ if ( table2 == null )
+ return table1 == null ? null : annotationForOneTable(StrandBiasTableUtils.FisherExactPValueForContingencyTable(table1));
+ else if (table1 == null)
+ return annotationForOneTable(StrandBiasTableUtils.FisherExactPValueForContingencyTable(table2));
+ else { // take the one with the best (i.e., least significant pvalue)
+ double pvalue1 = StrandBiasTableUtils.FisherExactPValueForContingencyTable(table1);
+ double pvalue2 = StrandBiasTableUtils.FisherExactPValueForContingencyTable(table2);
+ return annotationForOneTable(Math.max(pvalue1, pvalue2));
+ }
+ }
+
+ /**
+ * Returns an annotation result given a pValue
+ *
+ * @param pValue
+ * @return a hash map from FS -> phred-scaled pValue
+ */
+ protected Map annotationForOneTable(final double pValue) {
+ final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs
+ return Collections.singletonMap(getKeyNames().get(0), value);
+ }
+
+ @Override
+ protected Map calculateReducedData(AlleleSpecificAnnotationData> combinedData) {
+ final Map annotationMap = new HashMap<>();
+ final Map> perAlleleData = combinedData.getAttributeMap();
+ final List refStrandCounts = perAlleleData.get(combinedData.getRefAllele());
+ for (final Allele a : perAlleleData.keySet()) {
+ if(a.equals(combinedData.getRefAllele(),true))
+ continue;
+ final List altStrandCounts = combinedData.getAttribute(a);
+ final int[][] refAltTable = new int[][] {new int[]{refStrandCounts.get(0),refStrandCounts.get(1)},new int[]{altStrandCounts.get(0),altStrandCounts.get(1)}};
+ annotationMap.put(a,QualityUtils.phredScaleErrorRate(Math.max(StrandBiasTableUtils.FisherExactPValueForContingencyTable(refAltTable), MIN_PVALUE)));
+ }
+ return annotationMap;
+ }
+
+
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
new file mode 100644
index 000000000..b3255fdf1
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_MappingQualityRankSumTest.java
@@ -0,0 +1,87 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.pileup.PileupElement;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Allele specific Rank Sum Test for mapping qualities of REF versus each ALT reads
+ *
+ * Currently this annotation duplicate the MappingQualityRankSumTest annotation
+ *
+ */
+public class AS_MappingQualityRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation {
+ @Override
+ public List getKeyNames() { return Arrays.asList(GATKVCFConstants.AS_MAP_QUAL_RANK_SUM_KEY); }
+
+ @Override
+ public String getRawKeyName() { return GATKVCFConstants.AS_RAW_MAP_QUAL_RANK_SUM_KEY;}
+
+ @Override
+ protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
+ return (double)read.getMappingQuality();
+ }
+}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSAnnotation.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSAnnotation.java
new file mode 100644
index 000000000..33930d2a7
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSAnnotation.java
@@ -0,0 +1,192 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.Genotype;
+import htsjdk.variant.variantcontext.GenotypesContext;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeaderLine;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.*;
+
+/**
+ * Allele-specific implementation of root-mean-squared annotations
+ */
+public abstract class AS_RMSAnnotation extends RMSAnnotation {
+ protected final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class);
+ protected final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe
+ protected final String printDelim = "|";
+ protected AnnotatorCompatible callingWalker;
+
+
+ @Override
+ public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set headerLines) {
+ if (!AnnotationUtils.walkerSupportsAlleleSpecificAnnotations(walker))
+ logger.warn("Allele-specific annotations can only be used with HaplotypeCaller, CombineGVCFs and GenotypeGVCFs -- no data will be output");
+ callingWalker = walker;
+ }
+
+ public List getDescriptions() {
+ if (AnnotationUtils.walkerRequiresRawData(callingWalker))
+ return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getRawKeyName()));
+ else
+ return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
+ }
+
+ //For the raw data here, we're only keeping track of the sum of the squares of our values
+ //When we go to reduce, we'll use the AD info to get the number of reads
+ public void calculateRawData(final VariantContext vc,
+ final Map perReadAlleleLikelihoodMap,
+ final ReducibleAnnotationData myData) {
+
+ //must use perReadAlleleLikelihoodMap for allele-specific annotations
+ if (perReadAlleleLikelihoodMap != null) {
+ if ( perReadAlleleLikelihoodMap.size() == 0 )
+ return;
+ getRMSDataFromPRALM(perReadAlleleLikelihoodMap, myData);
+ }
+ else
+ return;
+ }
+
+ abstract void getRMSDataFromPRALM(final Map perReadAlleleLikelihoodMap, final ReducibleAnnotationData myData);
+
+ @Override
+ public Map finalizeRawData(final VariantContext vc, final VariantContext originalVC) {
+ if (!vc.hasAttribute(getRawKeyName()))
+ return new HashMap<>();
+ final String rawMQdata = vc.getAttributeAsString(getRawKeyName(),null);
+ if (rawMQdata == null)
+ return new HashMap<>();
+
+ final Map annotations = new HashMap<>();
+ final ReducibleAnnotationData myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawMQdata);
+ parseRawDataString(myData);
+
+ final String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap());
+ annotations.put(getKeyNames().get(0), annotationString);
+ return annotations;
+ }
+
+ @Override
+ protected void parseRawDataString(final ReducibleAnnotationData myData) {
+ final String rawDataString = myData.getRawData();
+ //get per-allele data by splitting on allele delimiter
+ final String[] rawDataPerAllele = rawDataString.split(splitDelim);
+ for (int i=0; i combineRawData(final List vcAlleles, final List extends ReducibleAnnotationData> annotationList) {
+ //VC already contains merged alleles from ReferenceConfidenceVariantContextMerger
+ ReducibleAnnotationData combinedData = new AlleleSpecificAnnotationData(vcAlleles, null);
+
+ for (final ReducibleAnnotationData currentValue : annotationList) {
+ parseRawDataString(currentValue);
+ combineAttributeMap(currentValue, combinedData);
+
+ }
+ final Map annotations = new HashMap<>();
+ String annotationString = makeRawAnnotationString(vcAlleles, combinedData.getAttributeMap());
+ annotations.put(getRawKeyName(), annotationString);
+ return annotations;
+ }
+
+ @Override
+ public void combineAttributeMap(final ReducibleAnnotationData toAdd, final ReducibleAnnotationData combined) {
+ //check that alleles match
+ for (final Allele currentAllele : combined.getAlleles()){
+ //combined is initialized with all alleles, but toAdd might have only a subset
+ if(toAdd.getAttribute(currentAllele) == null)
+ continue;
+ if (toAdd.getAttribute(currentAllele) != null && combined.getAttribute(currentAllele) != null) {
+ combined.putAttribute(currentAllele, (double) combined.getAttribute(currentAllele) + (double) toAdd.getAttribute(currentAllele));
+ }
+ else
+ combined.putAttribute(currentAllele, toAdd.getAttribute(currentAllele));
+ }
+ }
+
+ protected Map getADcounts(final VariantContext vc) {
+ final GenotypesContext genotypes = vc.getGenotypes();
+ if ( genotypes == null || genotypes.size() == 0 ) {
+ logger.warn("VC does not have genotypes -- annotations were calculated in wrong order");
+ return null;
+ }
+
+ final Map variantADs = new HashMap<>();
+ for(final Allele a : vc.getAlleles())
+ variantADs.put(a,0);
+
+ for (final Genotype gt : vc.getGenotypes()) {
+ if(!gt.hasAD()) {
+ continue;
+ }
+ final int[] ADs = gt.getAD();
+ for(int i = 1; i < vc.getNAlleles(); i++) {
+ variantADs.put(vc.getAlternateAllele(i-1), variantADs.get(vc.getAlternateAllele(i-1))+ADs[i]); //here -1 is to reconcile allele index with alt allele index
+ }
+ }
+ return variantADs;
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java
new file mode 100644
index 000000000..1b44d66bc
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RMSMappingQuality.java
@@ -0,0 +1,147 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+
+import java.util.*;
+
+
+/**
+ * Allele-specific Root Mean Square of the mapping quality of reads across all samples.
+ *
+ *
This annotation provides an estimation of the mapping quality of reads supporting each alternate allele in a variant call. Depending on the tool it is called from, it produces either raw data (sum of squared MQs) or the calculated root mean square.
+ *
+ * The raw data is used to accurately calculate the root mean square when combining more than one sample.
+ *
+ *
Statistical notes
+ *
The root mean square is equivalent to the mean of the mapping qualities plus the standard deviation of the mapping qualities.
Uninformative reads are not used in these annotations.
+ *
+ */
+public class AS_RMSMappingQuality extends AS_RMSAnnotation implements AS_StandardAnnotation, ActiveRegionBasedAnnotation {
+
+ protected final String printFormat = "%.2f";
+
+ public List getKeyNames() { return Arrays.asList(GATKVCFConstants.AS_RMS_MAPPING_QUALITY_KEY); }
+
+ public String getRawKeyName() { return GATKVCFConstants.AS_RAW_RMS_MAPPING_QUALITY_KEY; }
+
+ public void getRMSDataFromPRALM(Map perReadAlleleLikelihoodMap, ReducibleAnnotationData myData) {
+ //over all the samples in the Map...
+ for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
+ //for each read...
+ for ( final Map.Entry> readLikelihoods : perReadLikelihoods.getLikelihoodReadMap().entrySet() ) {
+ final int mq = readLikelihoods.getKey().getMappingQuality();
+ if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
+ if (!PerReadAlleleLikelihoodMap.getMostLikelyAllele(readLikelihoods.getValue()).isInformative())
+ continue;
+ final Allele bestAllele =PerReadAlleleLikelihoodMap.getMostLikelyAllele(readLikelihoods.getValue()).getMostLikelyAllele();
+ double currSquareSum = 0;
+ if (myData.hasAttribute(bestAllele))
+ currSquareSum += (double)myData.getAttribute(bestAllele);
+ myData.putAttribute(bestAllele, currSquareSum + mq * mq);
+ }
+ }
+ }
+ }
+
+ @Override
+ public String makeRawAnnotationString(final List vcAlleles, final Map perAlleleValues) {
+ String annotationString = "";
+ for (final Allele current : vcAlleles) {
+ if (!annotationString.isEmpty())
+ annotationString += printDelim;
+ if(perAlleleValues.get(current) != null)
+ annotationString += String.format(printFormat,perAlleleValues.get(current));
+ else
+ annotationString += String.format(printFormat, 0.0);
+ }
+ return annotationString;
+ }
+
+ //this just overrides the RMSAnnotation function that's used for UG -- we don't do allele-specific annotations for UG
+ @Override
+ public String makeFinalizedAnnotationString(final VariantContext vc, final Map perAlleleData, final Map stratifiedContexts, final Map perReadAlleleLikelihoodMap) {
+ return makeFinalizedAnnotationString(vc, perAlleleData);
+ }
+
+ @Override
+ public String makeFinalizedAnnotationString(final VariantContext vc, final Map perAlleleValues) {
+ final Map variantADs = getADcounts(vc);
+ String annotationString = "";
+ for (final Allele current : vc.getAlternateAlleles()) {
+ if (!annotationString.isEmpty())
+ annotationString += ",";
+ if (perAlleleValues.containsKey(current))
+ annotationString += String.format(printFormat, Math.sqrt((double) perAlleleValues.get(current) / variantADs.get(current)));
+ else {
+ logger.warn("ERROR: VC allele is not found in annotation alleles -- maybe there was trimming?");
+ }
+ }
+ return annotationString;
+ }
+}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java
new file mode 100644
index 000000000..637f4574f
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_RankSumTest.java
@@ -0,0 +1,329 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeaderLine;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation;
+import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
+import org.broadinstitute.gatk.tools.walkers.variantutils.CombineGVCFs;
+import org.broadinstitute.gatk.tools.walkers.variantutils.GenotypeGVCFs;
+import org.broadinstitute.gatk.utils.MannWhitneyU;
+import org.broadinstitute.gatk.utils.collections.Pair;
+import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
+import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
+import org.broadinstitute.gatk.utils.exceptions.GATKException;
+import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.*;
+
+/**
+ * Allele-specific implementation of rank sum test annotations
+ */
+public abstract class AS_RankSumTest extends RankSumTest implements ReducibleAnnotation {
+ private final static Logger logger = Logger.getLogger(AS_RMSAnnotation.class);
+ protected final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe
+ protected final String printDelim = "|";
+ protected final String reducedDelim = ",";
+ protected AnnotatorCompatible callingWalker;
+
+ @Override
+ public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set headerLines) {
+ if (!AnnotationUtils.walkerSupportsAlleleSpecificAnnotations(walker))
+ logger.warn("Allele-specific annotations can only be used with HaplotypeCaller, CombineGVCFs and GenotypeGVCFs -- no data will be output");
+ callingWalker = walker;
+ super.initialize(walker, toolkit, headerLines);
+ }
+
+ public List getDescriptions() {
+ if (AnnotationUtils.walkerRequiresRawData(callingWalker))
+ return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getRawKeyName()));
+ else
+ return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
+ }
+
+ public Map annotateRawData(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final Map stratifiedContexts,
+ final VariantContext vc,
+ final Map perReadAlleleLikelihoodMap ) {
+
+ if ( perReadAlleleLikelihoodMap == null)
+ return new HashMap<>();
+
+ final Map annotations = new HashMap<>();
+ final AlleleSpecificAnnotationData> myData = initializeNewAnnotationData(vc.getAlleles());
+ calculateRawData(vc, perReadAlleleLikelihoodMap, myData);
+ final String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
+ annotations.put(getRawKeyName(), annotationString);
+ return annotations;
+ }
+
+ protected void parseRawDataString(final ReducibleAnnotationData> myData) {
+ final String rawDataString = myData.getRawData();
+ String rawDataNoBrackets;
+ final Map> perAlleleValues = new HashMap<>();
+ //Initialize maps
+ for (final Allele current : myData.getAlleles()) {
+ perAlleleValues.put(current, new CompressedDataList());
+ }
+ //Map gives back list with []
+ if (rawDataString.charAt(0) == '[') {
+ rawDataNoBrackets = rawDataString.substring(1, rawDataString.length() - 1);
+ }
+ else {
+ rawDataNoBrackets = rawDataString;
+ }
+ //rawDataPerAllele is the list of values for each allele (each of variable length)
+ final String[] rawDataPerAllele = rawDataNoBrackets.split(splitDelim);
+ for (int i=0; i alleleList = perAlleleValues.get(myData.getAlleles().get(i));
+ final String[] rawListEntriesAsStringVector = alleleData.split(",");
+ if (rawListEntriesAsStringVector.length %2 != 0)
+ throw new GATKException("ERROR: rank sum test raw annotation data must occur in pairs");
+ for (int j=0; j combineRawData(final List vcAlleles, final List extends ReducibleAnnotationData> annotationList) {
+ //VC already contains merged alleles from ReferenceConfidenceVariantContextMerger
+ final ReducibleAnnotationData combinedData = initializeNewAnnotationData(vcAlleles);
+
+ for (final ReducibleAnnotationData currentValue : annotationList) {
+ parseRawDataString(currentValue);
+ combineAttributeMap(currentValue, combinedData);
+
+ }
+ final Map annotations = new HashMap<>();
+ final String annotationString = makeRawAnnotationString(vcAlleles, combinedData.getAttributeMap());
+ annotations.put(getRawKeyName(), annotationString);
+ return annotations;
+ }
+
+ protected AlleleSpecificAnnotationData initializeNewAnnotationData(final List vcAlleles) {
+ Map> perAlleleValues = new HashMap<>();
+ for (Allele a : vcAlleles) {
+ perAlleleValues.put(a, new CompressedDataList());
+ }
+ AlleleSpecificAnnotationData ret = new AlleleSpecificAnnotationData(vcAlleles, perAlleleValues.toString());
+ ret.setAttributeMap(perAlleleValues);
+ return ret;
+ }
+
+ protected void combineAttributeMap(final ReducibleAnnotationData> toAdd, final ReducibleAnnotationData> combined) {
+ for (final Allele a : combined.getAlleles()) {
+ if (toAdd.hasAttribute(a)) {
+ final CompressedDataList alleleData = combined.getAttribute(a);
+ alleleData.add(toAdd.getAttribute(a));
+ combined.putAttribute(a, alleleData);
+ }
+ }
+ }
+
+ protected String makeRawAnnotationString(final List vcAlleles, final Map> perAlleleValues) {
+ String annotationString = "";
+ for (int i =0; i< vcAlleles.size(); i++) {
+ if (i!=0)
+ annotationString += printDelim;
+ CompressedDataList alleleValues = perAlleleValues.get(vcAlleles.get(i));
+ annotationString += alleleValues.toString();
+ }
+ return annotationString;
+ }
+
+ protected String makeReducedAnnotationString(VariantContext vc, Map perAltRankSumResults) {
+ String annotationString = "";
+ for (final Allele a : vc.getAlternateAlleles()) {
+ if (!annotationString.isEmpty())
+ annotationString += reducedDelim;
+ if (!perAltRankSumResults.containsKey(a))
+ logger.warn("ERROR: VC allele not found in annotation alleles -- maybe there was trimming?");
+ else
+ annotationString += String.format("%.3f", perAltRankSumResults.get(a));
+ }
+ return annotationString;
+ }
+
+ /**
+ *
+ * @param vc -- contains the final set of alleles, possibly subset by GenotypeGVCFs
+ * @param originalVC -- used to get all the alleles for all gVCFs
+ * @return
+ */
+ public Map finalizeRawData(final VariantContext vc, final VariantContext originalVC) {
+ if (!vc.hasAttribute(getRawKeyName()))
+ return new HashMap<>();
+
+ final String rawRankSumData = vc.getAttributeAsString(getRawKeyName(),null);
+ if (rawRankSumData == null)
+ return new HashMap<>();
+
+ final Map annotations = new HashMap<>();
+ final AlleleSpecificAnnotationData> myData = new AlleleSpecificAnnotationData(originalVC.getAlleles(), rawRankSumData);
+ parseRawDataString(myData);
+
+ final Map perAltRankSumResults = calculateReducedData(myData.getAttributeMap(), myData.getRefAllele());
+ //shortcut for no ref values
+ if (perAltRankSumResults.isEmpty())
+ return annotations;
+ final String annotationString = makeReducedAnnotationString(vc, perAltRankSumResults);
+ annotations.put(getKeyNames().get(0), annotationString);
+ return annotations;
+ }
+
+ public void calculateRawData(VariantContext vc, Map pralm, ReducibleAnnotationData myData) {
+ if(pralm == null)
+ return;
+
+ final Map> perAlleleValues = myData.getAttributeMap();
+ for ( final PerReadAlleleLikelihoodMap likelihoodMap : pralm.values() ) {
+ if ( likelihoodMap != null && !likelihoodMap.isEmpty() ) {
+ fillQualsFromLikelihoodMap(vc.getAlleles(), vc.getStart(), likelihoodMap, perAlleleValues);
+ }
+ }
+
+ }
+
+ private void fillQualsFromLikelihoodMap(final List alleles,
+ final int refLoc,
+ final PerReadAlleleLikelihoodMap likelihoodMap,
+ final Map> perAlleleValues) {
+ for ( final Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet() ) {
+ final MostLikelyAllele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
+ if ( ! a.isInformative() )
+ continue; // read is non-informative
+
+ final GATKSAMRecord read = el.getKey();
+ if ( isUsableRead(read, refLoc) ) {
+ final Double value = getElementForRead(read, refLoc, a);
+ if ( value == null )
+ continue;
+
+ if(perAlleleValues.containsKey(a.getMostLikelyAllele()))
+ perAlleleValues.get(a.getMostLikelyAllele()).add(value.intValue());
+ }
+ }
+ }
+
+ public Map calculateReducedData(final Map> perAlleleValues, final Allele ref) {
+ final Map perAltRankSumResults = new HashMap<>();
+ //shortcut to not try to calculate rank sum if there are no reads that unambiguously support the ref
+ if (perAlleleValues.get(ref).isEmpty())
+ return perAltRankSumResults;
+ for (final Allele alt : perAlleleValues.keySet()) {
+ if (alt.equals(ref, false))
+ continue;
+ final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering);
+ //load alts
+ for (final Number qual : perAlleleValues.get(alt)) {
+ mannWhitneyU.add(qual, MannWhitneyU.USet.SET1);
+ }
+ //load refs
+ for (final Number qual : perAlleleValues.get(ref)) {
+ mannWhitneyU.add(qual, MannWhitneyU.USet.SET2);
+ }
+
+ if (DEBUG) {
+ System.out.format("%s, REF QUALS:", this.getClass().getName());
+ for (final Number qual : perAlleleValues.get(ref))
+ System.out.format("%d ", qual);
+ System.out.println();
+ System.out.format("%s, ALT QUALS:", this.getClass().getName());
+ for (final Number qual : perAlleleValues.get(alt))
+ System.out.format("%d ", qual);
+ System.out.println();
+
+ }
+ // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases)
+ final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1);
+ perAltRankSumResults.put(alt, testResults.first);
+ }
+ return perAltRankSumResults;
+ }
+
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
new file mode 100644
index 000000000..5cd4e060b
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_ReadPosRankSumTest.java
@@ -0,0 +1,107 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AS_StandardAnnotation;
+import org.broadinstitute.gatk.utils.pileup.PileupElement;
+import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.sam.ReadUtils;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * Allele-specific Rank Sum Test for relative positioning of REF versus each ALT allele within reads
+ *
+ *
This variant-level annotation tests whether there is evidence of bias in the position of alleles within the reads that support them, between the reference and each alternate allele. Seeing an allele only near the ends of reads is indicative of error, because that is where sequencers tend to make the most errors. However, some variants located near the edges of sequenced regions will necessarily be covered by the ends of reads, so we can't just set an absolute "minimum distance from end of read" threshold. That is why we use a rank sum test to evaluate whether there is a difference in how well the reference allele and the alternate allele are supported.
+ *
+ *
The ideal result is a value close to zero, which indicates there is little to no difference in where the alleles are found relative to the ends of reads. A negative value indicates that the alternate allele is found at the ends of reads more often than the reference allele. Conversely, a positive value indicates that the reference allele is found at the ends of reads more often than the alternate allele.
+ *
+ *
This annotation can be used to evaluate confidence in a variant call and is a recommended covariate for variant recalibration (VQSR). Finding a statistically significant difference in relative position either way suggests that the sequencing process may have been biased or affected by an artifact. In practice, we only filter out low negative values when evaluating variant quality because the idea is to filter out variants for which the quality of the data supporting the alternate allele is comparatively low. The reverse case, where it is the quality of data supporting the reference allele that is lower (resulting in positive ranksum scores), is not really informative for filtering variants.
+ *
+ *
Statistical notes
+ *
The value output for this annotation is the u-based z-approximation from the Mann-Whitney-Wilcoxon Rank Sum Test for site position within reads (position within reads supporting REF vs. position within reads supporting ALT). See the method document on statistical tests for a more detailed explanation of the ranksum test.
+ *
+ *
Caveat
+ *
Uninformative reads are not used in these annotations.
+ *
+ *
+ */
+public class AS_ReadPosRankSumTest extends AS_RankSumTest implements AS_StandardAnnotation {
+
+ @Override
+ public List getKeyNames() { return Arrays.asList(GATKVCFConstants.AS_READ_POS_RANK_SUM_KEY); }
+
+ @Override
+ public String getRawKeyName() { return GATKVCFConstants.AS_RAW_READ_POS_RANK_SUM_KEY;}
+
+ @Override
+ protected Double getElementForRead(final GATKSAMRecord read, final int refLoc) {
+ final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true);
+ if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED )
+ return null;
+
+ int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), offset, false, 0, 0);
+ final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read );
+ if (readPos > numAlignedBases / 2)
+ readPos = numAlignedBases - (readPos + 1);
+ return (double)readPos;
+ }
+
+ @Override
+ protected boolean isUsableRead(final GATKSAMRecord read, final int refLoc) {
+ return super.isUsableRead(read, refLoc) && read.getSoftStart() + read.getCigar().getReadLength() > refLoc;
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandBiasTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandBiasTest.java
new file mode 100644
index 000000000..acda6c7ad
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandBiasTest.java
@@ -0,0 +1,379 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.GenotypesContext;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFHeaderLine;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation;
+import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
+import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
+import org.broadinstitute.gatk.utils.exceptions.GATKException;
+import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.*;
+
+/**
+ * Allele-specific implementation of strand bias annotations
+ */
+public abstract class AS_StrandBiasTest extends StrandBiasTest implements ReducibleAnnotation {
+ private final static Logger logger = Logger.getLogger(StrandBiasTest.class);
+ protected final String splitDelim = "\\|"; //String.split takes a regex, so we need to escape the pipe
+ protected final String printDelim = "|";
+ protected final String reducedDelim = ",";
+ protected AnnotatorCompatible callingWalker;
+ protected final int MIN_COUNT = 2;
+ protected static final double MIN_PVALUE = 1E-320;
+ protected final int FORWARD = 0;
+ protected final int REVERSE = 1;
+ protected final ArrayList ZERO_LIST = new ArrayList<>();
+
+ @Override
+ public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set headerLines) {
+ if (!AnnotationUtils.walkerSupportsAlleleSpecificAnnotations(walker))
+ logger.warn("Allele-specific annotations can only be used with HaplotypeCaller, CombineGVCFs and GenotypeGVCFs -- no data will be output");
+ callingWalker = walker;
+ ZERO_LIST.add(0,0);
+ ZERO_LIST.add(1,0);
+ }
+
+ @Override
+ public List getDescriptions() {
+ if (AnnotationUtils.walkerRequiresRawData(callingWalker))
+ return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getRawKeyName()));
+ else
+ return Arrays.asList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
+ }
+
+ @Override
+ public String getRawKeyName() { return GATKVCFConstants.AS_SB_TABLE_KEY; }
+
+ public Map annotateRawData(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final Map stratifiedContexts,
+ final VariantContext vc,
+ final Map perReadAlleleLikelihoodMap ) {
+
+ //for allele-specific annotations we only call from HC and we only use perReadAlleleLikelihoodMap
+ if ( perReadAlleleLikelihoodMap == null)
+ return new HashMap<>();
+
+ // calculate the annotation from the stratified per read likelihood map
+ // stratifiedPerReadAllelelikelihoodMap can come from HaplotypeCaller call to VariantAnnotatorEngine
+ else if (perReadAlleleLikelihoodMap != null) {
+ final HashMap annotations = new HashMap<>();
+ final ReducibleAnnotationData> myData = new AlleleSpecificAnnotationData<>(vc.getAlleles(),null);
+ calculateRawData(vc, perReadAlleleLikelihoodMap, myData);
+ final Map> perAlleleValues = myData.getAttributeMap();
+ final String annotationString = makeRawAnnotationString(vc.getAlleles(), perAlleleValues);
+ annotations.put(getRawKeyName(), annotationString);
+ return annotations;
+ }
+ else {
+ // for non-snp variants, we need per-read likelihoods.
+ // for snps, we can get same result from simple pileup
+ // for indels that do not have a computed strand bias (SB) or strand bias by sample (SBBS)
+ return null;
+ }
+ }
+
+ protected void parseRawDataString(ReducibleAnnotationData> myData) {
+ final String rawDataString = myData.getRawData();
+ String[] rawDataPerAllele;
+ String[] rawListEntriesAsStringVector;
+ Map> perAlleleValues = new HashMap<>();
+ //Initialize maps
+ for (Allele current : myData.getAlleles()) {
+ perAlleleValues.put(current, new LinkedList());
+ }
+ //rawDataPerAllele is the list of values for each allele (each of variable length)
+ rawDataPerAllele = rawDataString.split(splitDelim);
+ for (int i=0; i alleleList = perAlleleValues.get(myData.getAlleles().get(i));
+ rawListEntriesAsStringVector = alleleData.split(",");
+ //Read counts will only ever be integers
+ for (String s : rawListEntriesAsStringVector) {
+ if (!s.isEmpty())
+ alleleList.add(Integer.parseInt(s.trim()));
+ }
+ }
+ myData.setAttributeMap(perAlleleValues);
+ }
+
+ @Override
+ public Map combineRawData(final List vcAlleles, final List extends ReducibleAnnotationData> annotationList) {
+ //VC already contains merged alleles from ReferenceConfidenceVariantContextMerger
+ ReducibleAnnotationData combinedData = new AlleleSpecificAnnotationData(vcAlleles, null);
+
+ for (final ReducibleAnnotationData currentValue : annotationList) {
+ parseRawDataString(currentValue);
+ combineAttributeMap(currentValue, combinedData);
+ }
+ final Map annotations = new HashMap<>();
+ final String annotationString = makeRawAnnotationString(vcAlleles, combinedData.getAttributeMap());
+ annotations.put(getRawKeyName(), annotationString);
+ return annotations;
+ }
+
+ protected void combineAttributeMap(final ReducibleAnnotationData> toAdd, final ReducibleAnnotationData> combined) {
+ for (final Allele a : combined.getAlleles()) {
+ if (toAdd.hasAttribute(a) && toAdd.getAttribute(a) != null) {
+ if (combined.getAttribute(a) != null) {
+ combined.getAttribute(a).set(0, (int) combined.getAttribute(a).get(0) + (int) toAdd.getAttribute(a).get(0));
+ combined.getAttribute(a).set(1, (int) combined.getAttribute(a).get(1) + (int) toAdd.getAttribute(a).get(1));
+ }
+ else {
+ List alleleData = new ArrayList<>();
+ alleleData.add(0, toAdd.getAttribute(a).get(0));
+ alleleData.add(1, toAdd.getAttribute(a).get(1));
+ combined.putAttribute(a,alleleData);
+ }
+ }
+ }
+ }
+
+ protected String makeRawAnnotationString(final List vcAlleles, final Map> perAlleleValues) {
+ String annotationString = "";
+ for (final Allele a : vcAlleles) {
+ if (!annotationString.isEmpty())
+ annotationString += printDelim;
+ List alleleValues = perAlleleValues.get(a);
+ if (alleleValues == null)
+ alleleValues = ZERO_LIST;
+ annotationString += encode(alleleValues);
+ }
+ return annotationString;
+ }
+
+ protected String encode(List alleleValues) {
+ String annotationString = "";
+ for (int j =0; j < alleleValues.size(); j++) {
+ annotationString += alleleValues.get(j);
+ if (j < alleleValues.size()-1)
+ annotationString += ",";
+ }
+ return annotationString;
+ }
+
+
+
+ protected String makeReducedAnnotationString(VariantContext vc, Map perAltsStrandCounts) {
+ String annotationString = "";
+ for (Allele a : vc.getAlternateAlleles()) {
+ if (!annotationString.isEmpty())
+ annotationString += reducedDelim;
+ if (!perAltsStrandCounts.containsKey(a))
+ logger.warn("ERROR: VC allele not found in annotation alleles -- maybe there was trimming?");
+ else
+ annotationString += String.format("%.3f", perAltsStrandCounts.get(a));
+ }
+ return annotationString;
+ }
+
+ /**
+ *
+ * @param vc -- contains the final set of alleles, possibly subset by GenotypeGVCFs
+ * @param originalVC -- used to get all the alleles for all gVCFs
+ * @return
+ */
+ @Override
+ public Map finalizeRawData(final VariantContext vc, final VariantContext originalVC) {
+ Map annotations = new HashMap<>();
+ if (!vc.hasAttribute(getRawKeyName()))
+ return new HashMap<>();
+ String rawRankSumData = vc.getAttributeAsString(getRawKeyName(),null);
+ if (rawRankSumData == null)
+ return new HashMap<>();
+
+ AlleleSpecificAnnotationData> myData = new AlleleSpecificAnnotationData<>(originalVC.getAlleles(), rawRankSumData);
+ parseRawDataString(myData);
+
+ Map perAltRankSumResults = calculateReducedData(myData);
+
+ String annotationString = makeReducedAnnotationString(vc, perAltRankSumResults);
+ annotations.put(getKeyNames().get(0), annotationString);
+ return annotations;
+ }
+
+ @Override
+ public void calculateRawData(final VariantContext vc, Map pralm, final ReducibleAnnotationData rawAnnotations) {
+ if(pralm == null)
+ return;
+
+ getStrandCountsFromLikelihoodMap(vc, pralm, rawAnnotations, MIN_COUNT);
+ }
+
+ protected abstract Map calculateReducedData(final AlleleSpecificAnnotationData> combinedData );
+
+ /**
+ Allocate and fill a 2x2 strand contingency table. In the end, it'll look something like this:
+ * fw rc
+ * allele1 # #
+ * allele2 # #
+ * @return a 2x2 contingency table
+ */
+ public void getStrandCountsFromLikelihoodMap( final VariantContext vc,
+ final Map stratifiedPerReadAlleleLikelihoodMap,
+ final ReducibleAnnotationData perAlleleValues,
+ final int minCount) {
+ if( stratifiedPerReadAlleleLikelihoodMap == null )
+ return;
+ if( vc == null )
+ return;
+
+ final Allele ref = vc.getReference();
+ final List allAlts = vc.getAlternateAlleles();
+
+ for (final PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) {
+ final ReducibleAnnotationData> sampleTable = new AlleleSpecificAnnotationData<>(vc.getAlleles(),null);
+ for (final Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) {
+ final MostLikelyAllele mostLikelyAllele = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue());
+ final GATKSAMRecord read = el.getKey();
+ updateTable(mostLikelyAllele.getAlleleIfInformative(), read, ref, allAlts, sampleTable);
+ }
+ //for each sample (value in stratified PRALM), only include it if there are >minCount informative reads
+ if ( passesMinimumThreshold(sampleTable, minCount) )
+ combineAttributeMap(sampleTable, perAlleleValues);
+ }
+ }
+
+ private void updateTable(final Allele bestAllele, final GATKSAMRecord read, final Allele ref, final List allAlts, final ReducibleAnnotationData> perAlleleValues) {
+
+ final boolean matchesRef = bestAllele.equals(ref, true);
+ final boolean matchesAnyAlt = allAlts.contains(bestAllele);
+
+ //for uninformative reads
+ if(bestAllele.isNoCall())
+ return;
+
+ //can happen if a read's most likely allele has been removed when --max_alternate_alleles is exceeded
+ if (!( matchesRef || matchesAnyAlt ))
+ return;
+
+ final List alleleStrandCounts;
+ if (perAlleleValues.hasAttribute(bestAllele) && perAlleleValues.getAttribute(bestAllele) != null)
+ alleleStrandCounts = perAlleleValues.getAttribute(bestAllele);
+ else {
+ alleleStrandCounts = new ArrayList<>();
+ alleleStrandCounts.add(0,0);
+ alleleStrandCounts.add(1,0);
+ }
+ if (read.isStrandless()) {
+ // a strandless read counts as observations on both strand, at 50% weight, with a minimum of 1
+ // (the 1 is to ensure that a strandless read always counts as an observation on both strands, even
+ // if the read is only seen once, because it's a merged read or other)
+ alleleStrandCounts.set(FORWARD, alleleStrandCounts.get(FORWARD)+1);
+ alleleStrandCounts.set(REVERSE, alleleStrandCounts.get(REVERSE)+1);
+ } else {
+ // a normal read with an actual strand
+ final boolean isFW = !read.getReadNegativeStrandFlag();
+ if (isFW)
+ alleleStrandCounts.set(FORWARD, alleleStrandCounts.get(FORWARD)+1);
+ else
+ alleleStrandCounts.set(REVERSE, alleleStrandCounts.get(REVERSE)+1);
+ }
+ perAlleleValues.putAttribute(bestAllele, alleleStrandCounts);
+ }
+
+ /**
+ * Does this strand data array pass the minimum threshold for inclusion?
+ *
+ * @param sampleTable the per-allele fwd/rev read counts for a single sample
+ * @param minCount The minimum threshold of counts in the array
+ * @return true if it passes the minimum threshold, false otherwise
+ */
+ protected boolean passesMinimumThreshold(final ReducibleAnnotationData> sampleTable, final int minCount) {
+ // the read total must be greater than MIN_COUNT
+ int readTotal = 0;
+ for (final List alleleValues : sampleTable.getAttributeMap().values()) {
+ if (alleleValues != null) {
+ readTotal += alleleValues.get(FORWARD);
+ readTotal += alleleValues.get(REVERSE);
+ }
+ }
+ return readTotal > minCount;
+ }
+
+
+ @Override
+ //Allele-specific annotations cannot be called from walkers other than HaplotypeCaller
+ protected Map calculateAnnotationFromGTfield(final GenotypesContext genotypes){
+ return new HashMap<>();
+ }
+
+ @Override
+ //Allele-specific annotations cannot be called from walkers other than HaplotypeCaller
+ protected Map calculateAnnotationFromStratifiedContexts(final Map stratifiedContexts,
+ final VariantContext vc){
+ return new HashMap<>();
+ }
+
+ @Override
+ //This just calls the non-allele-specific code in StrandBiasTest.java
+ protected abstract Map calculateAnnotationFromLikelihoodMap(final Map stratifiedPerReadAlleleLikelihoodMap,
+ final VariantContext vc);
+
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java
new file mode 100644
index 000000000..40cfe39b2
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AS_StrandOddsRatio.java
@@ -0,0 +1,123 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Allele-specific strand bias estimated by the Symmetric Odds Ratio test
+ *
+ */
+public class AS_StrandOddsRatio extends AS_StrandBiasTest implements AS_StandardAnnotation {
+
+ @Override
+ public List getKeyNames() {
+ return Collections.singletonList(GATKVCFConstants.AS_STRAND_ODDS_RATIO_KEY);
+ }
+
+ @Override
+ protected Map calculateAnnotationFromLikelihoodMap(Map stratifiedPerReadAlleleLikelihoodMap,
+ final VariantContext vc){
+ // either SNP with no alignment context, or indels: per-read likelihood map needed
+ final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc, MIN_COUNT);
+ final double ratio = calculateSOR(table);
+ return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.3f",ratio));
+ }
+
+ @Override
+ protected Map calculateReducedData(AlleleSpecificAnnotationData> combinedData) {
+ final Map annotationMap = new HashMap<>();
+ final Map> perAlleleData = combinedData.getAttributeMap();
+ final List refStrandCounts = perAlleleData.get(combinedData.getRefAllele());
+ for (final Allele a : perAlleleData.keySet()) {
+ List altStrandCounts = perAlleleData.get(a);
+ int[][] refAltTable = new int[][] {new int[]{refStrandCounts.get(0),refStrandCounts.get(1)},new int[]{altStrandCounts.get(0),altStrandCounts.get(1)}};
+ annotationMap.put(a,calculateSOR(refAltTable));
+ }
+ return annotationMap;
+ }
+
+ /**
+ * Computes the SOR value of a table after augmentation (adding pseudocounts). Based on the symmetric odds ratio but modified to take on
+ * low values when the reference +/- read count ratio is skewed but the alt count ratio is not. Natural log is taken
+ * to keep values within roughly the same range as other annotations.
+ *
+ * Adding pseudocounts prevent divide-by-zero.
+ *
+ * @param originalTable The table before augmentation
+ * @return the SOR annotation value
+ */
+ final protected double calculateSOR(final int[][] originalTable) {
+ final double[][] augmentedTable = StrandBiasTableUtils.augmentContingencyTable(originalTable);
+
+ double ratio = 0;
+
+ ratio += (augmentedTable[0][0] / augmentedTable[0][1]) * (augmentedTable[1][1] / augmentedTable[1][0]);
+ ratio += (augmentedTable[0][1] / augmentedTable[0][0]) * (augmentedTable[1][0] / augmentedTable[1][1]);
+
+ final double refRatio = (Math.min(augmentedTable[0][0], augmentedTable[0][1])/Math.max(augmentedTable[0][0], augmentedTable[0][1]));
+ final double altRatio = (Math.min(augmentedTable[1][0], augmentedTable[1][1])/Math.max(augmentedTable[1][0], augmentedTable[1][1]));
+
+ ratio = ratio*refRatio/altRatio;
+
+ return Math.log(ratio);
+ }
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java
index 48119e9a2..843f95108 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/AnnotationUtils.java
@@ -51,18 +51,40 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
+import htsjdk.samtools.Cigar;
+import htsjdk.samtools.CigarElement;
+import htsjdk.samtools.CigarOperator;
+import htsjdk.samtools.SAMRecord;
import htsjdk.variant.variantcontext.Genotype;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
+import org.broadinstitute.gatk.tools.walkers.variantutils.CombineGVCFs;
+import org.broadinstitute.gatk.tools.walkers.variantutils.GenotypeGVCFs;
import org.broadinstitute.gatk.utils.exceptions.ReviewedGATKException;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
public class AnnotationUtils {
- public static final String ANNOTATION_HC_WARN_MSG = " annotation will not be calculated, must be called from HaplotyepCaller";
+ public static final String ANNOTATION_HC_WARN_MSG = " annotation will not be calculated, must be called from HaplotypeCaller";
public static final int WARNINGS_LOGGED_SIZE = 3;
+ /**
+ * Checks if the walker is compatible with allele-specific annotations
+ */
+ public static boolean walkerSupportsAlleleSpecificAnnotations(final AnnotatorCompatible walker) {
+ return ((walker instanceof HaplotypeCaller) || (walker instanceof CombineGVCFs) || (walker instanceof GenotypeGVCFs));
+ }
+
+ /**
+ * Checks if the walker should get raw annotation data
+ */
+ public static boolean walkerRequiresRawData(final AnnotatorCompatible walker) {
+ return ((walker instanceof HaplotypeCaller && ((HaplotypeCaller) walker).emitReferenceConfidence()) || walker instanceof CombineGVCFs);
+ }
+
/**
* Checks if the input data is appropriate
*
@@ -129,4 +151,98 @@ public class AnnotationUtils {
return true;
}
+
+
+ /**
+ * Get the position of a variant within a read with respect to the closer end, accounting for hard clipped bases and low quality ends
+ * Used by ReadPosRankSum annotations
+ *
+ * @param read a read containing the variant
+ * @param initialReadPosition the position based on the modified, post-hard-clipped CIGAR
+ * @return read position
+ */
+ public static int getFinalVariantReadPosition(final GATKSAMRecord read, final int initialReadPosition) {
+ final int numAlignedBases = getNumAlignedBases(read);
+
+ int readPos = initialReadPosition;
+ //TODO: this doesn't work for the middle-right position if we index from zero
+ if (initialReadPosition > numAlignedBases / 2) {
+ readPos = numAlignedBases - (initialReadPosition + 1);
+ }
+ return readPos;
+
+ }
+
+ /**
+ *
+ * @param read a read containing the variant
+ * @return the number of hard clipped and low qual bases at the read start (where start is the leftmost end w.r.t. the reference)
+ */
+ public static int getNumClippedBasesAtStart(final SAMRecord read) {
+ // check for hard clips (never consider these bases):
+ final Cigar c = read.getCigar();
+ final CigarElement first = c.getCigarElement(0);
+
+ int numStartClippedBases = 0;
+ if (first.getOperator() == CigarOperator.H) {
+ numStartClippedBases = first.getLength();
+ }
+ final byte[] unclippedReadBases = read.getReadBases();
+ final byte[] unclippedReadQuals = read.getBaseQualities();
+
+ // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
+ // and may leave a string of Q2 bases still hanging off the reads.
+ //TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
+ for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
+ if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
+ numStartClippedBases++;
+ else
+ break;
+
+ }
+
+ return numStartClippedBases;
+ }
+
+
+ /**
+ *
+ * @param read a read containing the variant
+ * @return number of non-hard clipped, aligned bases (excluding low quality bases at either end)
+ */
+ //TODO: this is bizarre -- this code counts hard clips, but then subtracts them from the read length, which already doesn't count hard clips
+ public static int getNumAlignedBases(final GATKSAMRecord read) {
+ return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
+ }
+
+
+ /**
+ *
+ * @param read a read containing the variant
+ * @return number of hard clipped and low qual bases at the read end (where end is right end w.r.t. the reference)
+ */
+ public static int getNumClippedBasesAtEnd(final GATKSAMRecord read) {
+ // check for hard clips (never consider these bases):
+ final Cigar c = read.getCigar();
+ CigarElement last = c.getCigarElement(c.numCigarElements() - 1);
+
+ int numEndClippedBases = 0;
+ if (last.getOperator() == CigarOperator.H) {
+ numEndClippedBases = last.getLength();
+ }
+ final byte[] unclippedReadBases = read.getReadBases();
+ final byte[] unclippedReadQuals = read.getBaseQualities();
+
+ // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
+ // and may leave a string of Q2 bases still hanging off the reads.
+ //TODO: this code may not even get used because HaplotypeCaller already hard clips low quality tails
+ for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) {
+ if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
+ numEndClippedBases++;
+ else
+ break;
+ }
+
+ return numEndClippedBases;
+ }
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java
index c990ffd20..94a04ddfc 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ChromosomeCounts.java
@@ -90,6 +90,7 @@ import java.util.*;
public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
private Set founderIds = new HashSet();
+ private boolean didUniquifiedSampleNameCheck = false;
public Map annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
@@ -99,6 +100,11 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn
final Map perReadAlleleLikelihoodMap ) {
if ( ! vc.hasGenotypes() )
return null;
+ //if none of the "founders" are in the vc samples, assume we uniquified the samples upstream and they are all founders
+ if (!didUniquifiedSampleNameCheck) {
+ checkSampleNames(vc);
+ didUniquifiedSampleNameCheck = true;
+ }
return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true,founderIds);
}
@@ -113,4 +119,21 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn
}
public List getDescriptions() { return Arrays.asList(ChromosomeCountConstants.descriptions); }
+
+ //this method is intended to reconcile uniquified sample names
+ // it comes into play when calling this annotation from GenotypeGVCFs with --uniquifySamples because founderIds
+ // is derived from the sampleDB, which comes from the input sample names, but vc will have uniquified (i.e. different)
+ // sample names. Without this check, the founderIds won't be found in the vc and the annotation won't be calculated.
+ protected void checkSampleNames(final VariantContext vc) {
+ Set vcSamples = new HashSet<>();
+ vcSamples.addAll(vc.getSampleNames());
+ if (!vcSamples.isEmpty()) {
+ if (founderIds!=null) {
+ vcSamples.retainAll(founderIds);
+ if (vcSamples.isEmpty())
+ founderIds = vc.getSampleNames();
+ }
+ }
+ }
+
}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java
index 6c769eed7..adb4dbbca 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/DepthPerSampleHC.java
@@ -55,6 +55,7 @@ import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.GenotypeAnnotation;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
index 61cba0875..0f83e6444 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/FisherStrand.java
@@ -51,9 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
-import cern.jet.math.Arithmetic;
import htsjdk.variant.variantcontext.GenotypesContext;
-import org.apache.log4j.Logger;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
@@ -90,16 +88,25 @@ import java.util.*;
*/
public class FisherStrand extends StrandBiasTest implements StandardAnnotation, ActiveRegionBasedAnnotation {
private final static boolean ENABLE_DEBUGGING = false;
- private final static Logger logger = Logger.getLogger(FisherStrand.class);
private static final double MIN_PVALUE = 1E-320;
private static final int MIN_QUAL_FOR_FILTERED_TEST = 17;
private static final int MIN_COUNT = ARRAY_DIM;
+ @Override
+ public List getKeyNames() {
+ return Collections.singletonList(GATKVCFConstants.FISHER_STRAND_KEY);
+ }
+
+ @Override
+ public List getDescriptions() {
+ return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
+ }
+
@Override
protected Map calculateAnnotationFromGTfield(final GenotypesContext genotypes){
final int[][] tableFromPerSampleAnnotations = getTableFromSamples( genotypes, MIN_COUNT );
- return ( tableFromPerSampleAnnotations != null )? pValueForBestTable(tableFromPerSampleAnnotations, null) : null;
+ return ( tableFromPerSampleAnnotations != null )? pValueAnnotationForBestTable(tableFromPerSampleAnnotations, null) : null;
}
@Override
@@ -107,9 +114,11 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
final VariantContext vc){
final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), -1, MIN_COUNT);
final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAlternateAlleles(), MIN_QUAL_FOR_FILTERED_TEST, MIN_COUNT);
- printTable("unfiltered", tableNoFiltering);
- printTable("filtered", tableFiltering);
- return pValueForBestTable(tableFiltering, tableNoFiltering);
+ if (ENABLE_DEBUGGING) {
+ StrandBiasTableUtils.printTable("unfiltered", tableNoFiltering);
+ StrandBiasTableUtils.printTable("filtered", tableFiltering);
+ }
+ return pValueAnnotationForBestTable(tableFiltering, tableNoFiltering);
}
@Override
@@ -119,11 +128,9 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc, MIN_COUNT);
//logger.info("VC " + vc);
//printTable(table, 0.0);
- return pValueForBestTable(table, null);
+ return pValueAnnotationForBestTable(table, null);
}
-
-
/**
* Create an annotation for the highest (i.e., least significant) p-value of table1 and table2
*
@@ -131,14 +138,14 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
* @param table2 a contingency table, may be null
* @return annotation result for FS given tables
*/
- private Map pValueForBestTable(final int[][] table1, final int[][] table2) {
+ private Map pValueAnnotationForBestTable(final int[][] table1, final int[][] table2) {
if ( table2 == null )
- return table1 == null ? null : annotationForOneTable(pValueForContingencyTable(table1));
+ return table1 == null ? null : annotationForOneTable(StrandBiasTableUtils.FisherExactPValueForContingencyTable(table1));
else if (table1 == null)
- return annotationForOneTable(pValueForContingencyTable(table2));
+ return annotationForOneTable(StrandBiasTableUtils.FisherExactPValueForContingencyTable(table2));
else { // take the one with the best (i.e., least significant pvalue)
- double pvalue1 = pValueForContingencyTable(table1);
- double pvalue2 = pValueForContingencyTable(table2);
+ double pvalue1 = StrandBiasTableUtils.FisherExactPValueForContingencyTable(table1);
+ double pvalue2 = StrandBiasTableUtils.FisherExactPValueForContingencyTable(table2);
return annotationForOneTable(Math.max(pvalue1, pvalue2));
}
}
@@ -153,185 +160,4 @@ public class FisherStrand extends StrandBiasTest implements StandardAnnotation,
final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(pValue, MIN_PVALUE))); // prevent INFINITYs
return Collections.singletonMap(getKeyNames().get(0), value);
}
-
- @Override
- public List getKeyNames() {
- return Collections.singletonList(GATKVCFConstants.FISHER_STRAND_KEY);
- }
-
- @Override
- public List getDescriptions() {
- return Collections.singletonList(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
- }
-
- /**
- * Helper function to turn the FisherStrand table into the SB annotation array
- * @param table the table used by the FisherStrand annotation
- * @return the array used by the per-sample Strand Bias annotation
- */
- public static List getContingencyArray( final int[][] table ) {
- if(table.length != ARRAY_DIM || table[0].length != ARRAY_DIM) {
- logger.warn("Expecting a " + ARRAY_DIM + "x" + ARRAY_DIM + " strand bias table.");
- return null;
- }
-
- final List list = new ArrayList<>(ARRAY_SIZE); // TODO - if we ever want to do something clever with multi-allelic sites this will need to change
- list.add(table[0][0]);
- list.add(table[0][1]);
- list.add(table[1][0]);
- list.add(table[1][1]);
- return list;
- }
-
- public static Double pValueForContingencyTable(int[][] originalTable) {
- final int[][] normalizedTable = normalizeContingencyTable(originalTable);
-
- int[][] table = copyContingencyTable(normalizedTable);
-
- double pCutoff = computePValue(table);
- //printTable(table, pCutoff);
-
- double pValue = pCutoff;
- while (rotateTable(table)) {
- double pValuePiece = computePValue(table);
-
- //printTable(table, pValuePiece);
-
- if (pValuePiece <= pCutoff) {
- pValue += pValuePiece;
- }
- }
-
- table = copyContingencyTable(normalizedTable);
- while (unrotateTable(table)) {
- double pValuePiece = computePValue(table);
-
- //printTable(table, pValuePiece);
-
- if (pValuePiece <= pCutoff) {
- pValue += pValuePiece;
- }
- }
-
- //System.out.printf("P-cutoff: %f\n", pCutoff);
- //System.out.printf("P-value: %f\n\n", pValue);
-
- // min is necessary as numerical precision can result in pValue being slightly greater than 1.0
- return Math.min(pValue, 1.0);
- }
-
- // how large do we want the normalized table to be?
- private static final double TARGET_TABLE_SIZE = 200.0;
-
- /**
- * Normalize the table so that the entries are not too large.
- * Note that this method does NOT necessarily make a copy of the table being passed in!
- *
- * @param table the original table
- * @return a normalized version of the table or the original table if it is already normalized
- */
- private static int[][] normalizeContingencyTable(final int[][] table) {
- final int sum = table[0][0] + table[0][1] + table[1][0] + table[1][1];
- if ( sum <= TARGET_TABLE_SIZE * 2 )
- return table;
-
- final double normalizationFactor = (double)sum / TARGET_TABLE_SIZE;
-
- final int[][] normalized = new int[ARRAY_DIM][ARRAY_DIM];
- for ( int i = 0; i < ARRAY_DIM; i++ ) {
- for ( int j = 0; j < ARRAY_DIM; j++ )
- normalized[i][j] = (int)(table[i][j] / normalizationFactor);
- }
-
- return normalized;
- }
-
- private static int [][] copyContingencyTable(int [][] t) {
- int[][] c = new int[ARRAY_DIM][ARRAY_DIM];
-
- for ( int i = 0; i < ARRAY_DIM; i++ )
- for ( int j = 0; j < ARRAY_DIM; j++ )
- c[i][j] = t[i][j];
-
- return c;
- }
-
-
- private static void printTable(int[][] table, double pValue) {
- logger.info(String.format("%d %d; %d %d : %f", table[0][0], table[0][1], table[1][0], table[1][1], pValue));
- }
-
- /**
- * Printing information to logger.info for debugging purposes
- *
- * @param name the name of the table
- * @param table the table itself
- */
- private void printTable(final String name, final int[][] table) {
- if ( ENABLE_DEBUGGING ) {
- final String pValue = (String)annotationForOneTable(pValueForContingencyTable(table)).get(getKeyNames().get(0));
- logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s",
- name, table[0][0], table[0][1], table[1][0], table[1][1], pValue));
- }
- }
-
- private static boolean rotateTable(int[][] table) {
- table[0][0]--;
- table[1][0]++;
-
- table[0][1]++;
- table[1][1]--;
-
- return (table[0][0] >= 0 && table[1][1] >= 0);
- }
-
- private static boolean unrotateTable(int[][] table) {
- table[0][0]++;
- table[1][0]--;
-
- table[0][1]--;
- table[1][1]++;
-
- return (table[0][1] >= 0 && table[1][0] >= 0);
- }
-
- private static double computePValue(int[][] table) {
-
- int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
- int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
- int N = rowSums[0] + rowSums[1];
-
- // calculate in log space so we don't die with high numbers
- double pCutoff = Arithmetic.logFactorial(rowSums[0])
- + Arithmetic.logFactorial(rowSums[1])
- + Arithmetic.logFactorial(colSums[0])
- + Arithmetic.logFactorial(colSums[1])
- - Arithmetic.logFactorial(table[0][0])
- - Arithmetic.logFactorial(table[0][1])
- - Arithmetic.logFactorial(table[1][0])
- - Arithmetic.logFactorial(table[1][1])
- - Arithmetic.logFactorial(N);
- return Math.exp(pCutoff);
- }
-
- private static int sumRow(int[][] table, int column) {
- int sum = 0;
- for (int r = 0; r < table.length; r++) {
- sum += table[r][column];
- }
-
- return sum;
- }
-
- private static int sumColumn(int[][] table, int row) {
- int sum = 0;
- for (int c = 0; c < table[row].length; c++) {
- sum += table[row][c];
- }
-
- return sum;
- }
-
-
-
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSAnnotation.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSAnnotation.java
new file mode 100644
index 000000000..6e50d7192
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSAnnotation.java
@@ -0,0 +1,248 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.Genotype;
+import htsjdk.variant.variantcontext.VariantContext;
+import htsjdk.variant.vcf.VCFConstants;
+import htsjdk.variant.vcf.VCFHeaderLine;
+import htsjdk.variant.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ReducibleAnnotation;
+import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
+import org.broadinstitute.gatk.tools.walkers.variantutils.CombineGVCFs;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
+import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.gatk.utils.pileup.PileupElement;
+import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
+import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
+
+import java.util.*;
+
+/**
+ * Abstract root for all RankSum-based annotations
+ */
+public abstract class RMSAnnotation extends InfoFieldAnnotation implements ReducibleAnnotation {
+ protected AnnotatorCompatible callingWalker;
+
+ @Override
+ public void initialize(final AnnotatorCompatible walker, final GenomeAnalysisEngine toolkit, final Set headerLines) {
+ callingWalker = walker;
+ }
+
+ @Override
+ public List getDescriptions() {
+ final List headerLines = new ArrayList<>();
+ //ideally only HC in GVCF mode would get the raw header line, but that's a little more complicated
+ if (callingWalker instanceof HaplotypeCaller || callingWalker instanceof CombineGVCFs)
+ headerLines.add(GATKVCFHeaderLines.getInfoLine(getRawKeyName()));
+ headerLines.add(GATKVCFHeaderLines.getInfoLine(getKeyNames().get(0)));
+ return headerLines;
+ }
+
+ @Override
+ public Map annotate(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final Map stratifiedContexts,
+ final VariantContext vc,
+ final Map perReadAlleleLikelihoodMap ) {
+
+ if ( (stratifiedContexts == null || stratifiedContexts.isEmpty()) && perReadAlleleLikelihoodMap == null)
+ return null;
+
+ final Map annotations = new HashMap<>();
+ final ReducibleAnnotationData myData = new ReducibleAnnotationData<>(null);
+ calculateRawData(stratifiedContexts, perReadAlleleLikelihoodMap, myData);
+ final String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap(), stratifiedContexts, perReadAlleleLikelihoodMap);
+ annotations.put(getKeyNames().get(0), annotationString);
+ return annotations;
+ }
+
+ public Map annotateRawData(final RefMetaDataTracker tracker,
+ final AnnotatorCompatible walker,
+ final ReferenceContext ref,
+ final Map stratifiedContexts,
+ final VariantContext vc,
+ final Map perReadAlleleLikelihoodMap ) {
+
+ if ( perReadAlleleLikelihoodMap == null)
+ return new HashMap<>();
+
+ final Map annotations = new HashMap<>();
+ ReducibleAnnotationData myData = new ReducibleAnnotationData<>(null);
+ calculateRawData(vc, perReadAlleleLikelihoodMap, myData);
+ String annotationString = makeRawAnnotationString(vc.getAlleles(), myData.getAttributeMap());
+ annotations.put(getRawKeyName(), annotationString);
+ return annotations;
+ }
+
+ @Override
+ public Map combineRawData(final List vcAlleles, final List extends ReducibleAnnotationData> annotationList) {
+ //VC already contains merged alleles from ReferenceConfidenceVariantContextMerger
+ ReducibleAnnotationData combinedData = new ReducibleAnnotationData(null);
+
+ for (final ReducibleAnnotationData currentValue : annotationList) {
+ parseRawDataString(currentValue);
+ combineAttributeMap(currentValue, combinedData);
+
+ }
+ final Map annotations = new HashMap<>();
+ String annotationString = makeRawAnnotationString(vcAlleles, combinedData.getAttributeMap());
+ annotations.put(getRawKeyName(), annotationString);
+ return annotations;
+ }
+
+ @Override
+ public Map finalizeRawData(final VariantContext vc, final VariantContext originalVC) {
+ if (!vc.hasAttribute(getRawKeyName()))
+ return new HashMap<>();
+ String rawMQdata = vc.getAttributeAsString(getRawKeyName(),null);
+ if (rawMQdata == null)
+ return new HashMap<>();
+
+ ReducibleAnnotationData myData = new ReducibleAnnotationData(rawMQdata);
+ parseRawDataString(myData);
+
+ String annotationString = makeFinalizedAnnotationString(vc, myData.getAttributeMap());
+ return Collections.singletonMap(getKeyNames().get(0), (Object)annotationString);
+ }
+
+ protected void parseRawDataString(ReducibleAnnotationData myData) {
+ final String rawDataString = myData.getRawData();
+ String[] rawMQdataAsStringVector;
+ rawMQdataAsStringVector = rawDataString.split(",");
+ double squareSum = Double.parseDouble(rawMQdataAsStringVector[0]);
+ myData.putAttribute(Allele.NO_CALL, squareSum);
+ }
+
+ public void combineAttributeMap(ReducibleAnnotationData toAdd, ReducibleAnnotationData combined) {
+ if (combined.getAttribute(Allele.NO_CALL) != null)
+ combined.putAttribute(Allele.NO_CALL, (Double) combined.getAttribute(Allele.NO_CALL) + (Double) toAdd.getAttribute(Allele.NO_CALL));
+ else
+ combined.putAttribute(Allele.NO_CALL, toAdd.getAttribute(Allele.NO_CALL));
+
+ }
+
+ //Implementations of this method should return a string consisting of the sum of the squared values for the attribute being annotated (or a delimited list of those if allele-specific)
+ abstract protected String makeRawAnnotationString(List vcAlleles, Map sumOfSquares);
+
+ //Implementations of this method should return a string with the finalized annotation value as will appear in the INFO field
+ abstract protected String makeFinalizedAnnotationString(VariantContext vc, Map sumOfSquares);
+
+ //Implementations of this method should return a string with the finalized annotation value as will appear in the INFO field
+ abstract protected String makeFinalizedAnnotationString(VariantContext vc, Map sumOfSquares, Map stratifiedContexts, final Map perReadAlleleLikelihoodMap);
+
+ protected void calculateRawData(final Map stratifiedContexts,
+ final Map perReadAlleleLikelihoodMap,
+ final ReducibleAnnotationData myData) {
+ if (perReadAlleleLikelihoodMap != null) {
+ calculateRawData((VariantContext) null, perReadAlleleLikelihoodMap, myData);
+ }
+ }
+
+ /**
+ *
+ * @param vc
+ * @param perReadAlleleLikelihoodMap
+ * @param stratifiedContexts
+ * @return the number of reads at the vc position (-1 if all read data is null)
+ */
+ public int getNumOfReads(final VariantContext vc,
+ final Map perReadAlleleLikelihoodMap,
+ final Map stratifiedContexts) {
+ //don't use the full depth because we don't calculate MQ for reference blocks
+ int numOfReads = 0;
+ if(vc.hasAttribute(VCFConstants.DEPTH_KEY)) {
+ numOfReads += Integer.parseInt(vc.getAttributeAsString(VCFConstants.DEPTH_KEY, "-1"));
+ if(vc.hasGenotypes()) {
+ for(Genotype gt : vc.getGenotypes()) {
+ if(gt.isHomRef() && gt.hasExtendedAttribute("MIN_DP")) //site-level DP contribution will come from MIN_DP for gVCF-called reference variants
+ numOfReads -= Integer.parseInt(gt.getExtendedAttribute("MIN_DP").toString());
+ }
+ }
+ return numOfReads;
+ }
+ else if (stratifiedContexts != null && !stratifiedContexts.isEmpty()) {
+ for ( final Map.Entry sample : stratifiedContexts.entrySet() ) {
+ final AlignmentContext context = sample.getValue();
+ for ( final PileupElement p : context.getBasePileup() ) {
+ int mq = p.getRead().getMappingQuality();
+ if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
+ numOfReads++;
+ }
+ }
+ }
+ return numOfReads;
+ }
+ else if (perReadAlleleLikelihoodMap != null && !perReadAlleleLikelihoodMap.isEmpty())
+ {
+ for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
+ for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() ) {
+ int mq = read.getMappingQuality();
+ if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
+ numOfReads++;
+ }
+ }
+ }
+ return numOfReads;
+ }
+ return -1;
+ }
+
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java
index 534357d43..7d0332e7d 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RMSMappingQuality.java
@@ -51,22 +51,21 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
-import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
-import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
-import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
-import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
-import org.broadinstitute.gatk.utils.MathUtils;
-import org.broadinstitute.gatk.utils.QualityUtils;
+import htsjdk.variant.variantcontext.Allele;
+import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import htsjdk.variant.vcf.VCFStandardHeaderLines;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*;
+import org.broadinstitute.gatk.tools.walkers.haplotypecaller.HaplotypeCaller;
+import org.broadinstitute.gatk.tools.walkers.variantutils.CombineGVCFs;
+import org.broadinstitute.gatk.utils.QualityUtils;
+import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
+import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
-import htsjdk.variant.variantcontext.VariantContext;
+import org.broadinstitute.gatk.utils.variant.GATKVCFConstants;
+import org.broadinstitute.gatk.utils.variant.GATKVCFHeaderLines;
import java.util.*;
@@ -74,7 +73,9 @@ import java.util.*;
/**
* Root Mean Square of the mapping quality of reads across all samples.
*
- *
This annotation provides an estimation of the overall mapping quality of reads supporting a variant call, averaged over all samples in a cohort.
+ *
This annotation provides an estimation of the overall mapping quality of reads supporting a variant call. It produce both raw data (sum of square and num of total reads) and the calculated root mean square.
+ *
+ * The raw data is used to accurately calculate the root mean square when combining more than one sample.
*
*
Statistical notes
*
The root mean square is equivalent to the mean of the mapping qualities plus the standard deviation of the mapping qualities.
@@ -85,51 +86,89 @@ import java.util.*;
*
*
*/
-public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
+public class RMSMappingQuality extends RMSAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation, ReducibleAnnotation {
- public Map annotate(final RefMetaDataTracker tracker,
- final AnnotatorCompatible walker,
- final ReferenceContext ref,
- final Map stratifiedContexts,
- final VariantContext vc,
- final Map perReadAlleleLikelihoodMap ) {
+ @Override //this needs an override because MQ is a VCF standard so it's headerline is in a different place
+ public List getDescriptions() {
+ final List headerLines = new ArrayList<>();
+ //only HC in GVCF mode should get the raw header line
+ if ((callingWalker instanceof HaplotypeCaller && ((HaplotypeCaller) callingWalker).emitReferenceConfidence()) || callingWalker instanceof CombineGVCFs)
+ headerLines.add(GATKVCFHeaderLines.getInfoLine(getRawKeyName()));
+ headerLines.add(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
+ return headerLines;
+ }
- final List qualities = new ArrayList<>();
+ public List getKeyNames() { return Arrays.asList(
+ VCFConstants.RMS_MAPPING_QUALITY_KEY);
+ }
+
+ public String getRawKeyName() { return GATKVCFConstants.RAW_RMS_MAPPING_QUALITY_KEY;}
+
+ @Override
+ public void calculateRawData(final VariantContext vc, final Map pralm, final ReducibleAnnotationData rawAnnotations) {
+ Double squareSum = 0.0;
+ if ( pralm.size() == 0 )
+ return;
+
+ for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : pralm.values() ) {
+ for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() ) {
+ int mq = read.getMappingQuality();
+ if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
+ squareSum += mq * mq;
+ }
+ }
+ }
+ rawAnnotations.putAttribute(Allele.NO_CALL,squareSum);
+ }
+
+ //this version applies to non-HaplotypeCaller annotators
+ @Override
+ protected void calculateRawData(final Map stratifiedContexts,
+ final Map perReadAlleleLikelihoodMap,
+ final ReducibleAnnotationData myData) {
+
+ Double squareSum = 0.0;
if ( stratifiedContexts != null ) {
if ( stratifiedContexts.size() == 0 )
- return null;
+ return;
for ( final Map.Entry sample : stratifiedContexts.entrySet() ) {
final AlignmentContext context = sample.getValue();
- for ( final PileupElement p : context.getBasePileup() )
- fillMappingQualitiesFromPileup(p.getRead().getMappingQuality(), qualities);
+ for ( final PileupElement p : context.getBasePileup() ) {
+ int mq = p.getRead().getMappingQuality();
+ if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
+ squareSum += mq * mq;
+ }
+ }
}
+ myData.putAttribute(Allele.NO_CALL,squareSum);
}
else if (perReadAlleleLikelihoodMap != null) {
- if ( perReadAlleleLikelihoodMap.size() == 0 )
- return null;
-
- for ( final PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) {
- for ( final GATKSAMRecord read : perReadLikelihoods.getStoredElements() )
- fillMappingQualitiesFromPileup(read.getMappingQuality(), qualities);
- }
- }
- else
- return null;
-
- final double rms = MathUtils.rms(qualities);
- return Collections.singletonMap(getKeyNames().get(0), (Object)String.format("%.2f", rms));
- }
-
- private static void fillMappingQualitiesFromPileup(final int mq, final List qualities) {
- if ( mq != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) {
- qualities.add(mq);
+ calculateRawData((VariantContext) null, perReadAlleleLikelihoodMap, myData);
}
}
- public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); }
-
- public List getDescriptions() {
- return Arrays.asList(VCFStandardHeaderLines.getInfoLine(getKeyNames().get(0)));
+ @Override
+ public String makeRawAnnotationString(final List vcAlleles, final Map perAlleleData) {
+ return String.format("%.2f", perAlleleData.get(Allele.NO_CALL));
}
+
+ @Override
+ public String makeFinalizedAnnotationString(final VariantContext vc, final Map perAlleleData, final Map stratifiedContexts, final Map perReadAlleleLikelihoodMap) {
+ if ((stratifiedContexts != null && !stratifiedContexts.isEmpty()) || perReadAlleleLikelihoodMap != null) {
+ int numOfReads = getNumOfReads(vc, perReadAlleleLikelihoodMap, stratifiedContexts);
+ return String.format("%.2f", Math.sqrt((double) perAlleleData.get(Allele.NO_CALL) / numOfReads));
+ }
+ else {
+ return makeFinalizedAnnotationString(vc, perAlleleData);
+ }
+ }
+
+ @Override
+ public String makeFinalizedAnnotationString(final VariantContext vc, final Map perAlleleData) {
+ int numOfReads = getNumOfReads(vc, null, null);
+ return String.format("%.2f", Math.sqrt((double)perAlleleData.get(Allele.NO_CALL)/numOfReads));
+ }
+
+
}
\ No newline at end of file
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java
index 999ebb8cd..4ed64464b 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/RankSumTest.java
@@ -52,12 +52,10 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.*;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.AnnotatorCompatible;
-import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.gatk.utils.genotyper.MostLikelyAllele;
import org.broadinstitute.gatk.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.gatk.utils.MannWhitneyU;
@@ -76,11 +74,12 @@ import java.util.*;
/**
- * Abstract root for all RankSum based annotations
+ * Abstract root for all RankSum-based annotations
*/
+//TODO: will eventually implement ReducibleAnnotation in order to preserve accuracy for CombineGVCFs and GenotypeGVCFs -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation
public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
static final boolean DEBUG = false;
- private boolean useDithering = true;
+ protected boolean useDithering = true;
public Map annotate(final RefMetaDataTracker tracker,
final AnnotatorCompatible walker,
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
index c39f872f9..3b33d80c6 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/ReadPosRankSumTest.java
@@ -51,12 +51,7 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
-import htsjdk.samtools.Cigar;
-import htsjdk.samtools.CigarElement;
-import htsjdk.samtools.CigarOperator;
-import htsjdk.samtools.SAMRecord;
import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.StandardAnnotation;
-import org.broadinstitute.gatk.tools.walkers.indels.PairHMMIndelErrorModel;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.gatk.utils.pileup.PileupElement;
import org.broadinstitute.gatk.utils.sam.AlignmentUtils;
@@ -109,7 +104,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
@Override
protected Double getElementForPileupElement(final PileupElement p) {
final int offset = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0);
- return (double)getFinalReadPosition(p.getRead(), offset);
+ return (double)AnnotationUtils.getFinalVariantReadPosition(p.getRead(), offset);
}
@Override
@@ -122,69 +117,5 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio
return super.isUsableRead(read, refLoc) && read.getSoftStart() + read.getCigar().getReadLength() > refLoc;
}
- private int getFinalReadPosition(final GATKSAMRecord read, final int initialReadPosition) {
- final int numAlignedBases = getNumAlignedBases(read);
- int readPos = initialReadPosition;
- if (initialReadPosition > numAlignedBases / 2) {
- readPos = numAlignedBases - (initialReadPosition + 1);
- }
- return readPos;
-
- }
-
- private int getNumClippedBasesAtStart(final SAMRecord read) {
- // compute total number of clipped bases (soft or hard clipped)
- // check for hard clips (never consider these bases):
- final Cigar c = read.getCigar();
- final CigarElement first = c.getCigarElement(0);
-
- int numStartClippedBases = 0;
- if (first.getOperator() == CigarOperator.H) {
- numStartClippedBases = first.getLength();
- }
- final byte[] unclippedReadBases = read.getReadBases();
- final byte[] unclippedReadQuals = read.getBaseQualities();
-
- // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
- // and may leave a string of Q2 bases still hanging off the reads.
- for (int i = numStartClippedBases; i < unclippedReadBases.length; i++) {
- if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
- numStartClippedBases++;
- else
- break;
-
- }
-
- return numStartClippedBases;
- }
-
- private int getNumAlignedBases(final GATKSAMRecord read) {
- return read.getReadLength() - getNumClippedBasesAtStart(read) - getNumClippedBasesAtEnd(read);
- }
-
- private int getNumClippedBasesAtEnd(final GATKSAMRecord read) {
- // compute total number of clipped bases (soft or hard clipped)
- // check for hard clips (never consider these bases):
- final Cigar c = read.getCigar();
- CigarElement last = c.getCigarElement(c.numCigarElements() - 1);
-
- int numEndClippedBases = 0;
- if (last.getOperator() == CigarOperator.H) {
- numEndClippedBases = last.getLength();
- }
- final byte[] unclippedReadBases = read.getReadBases();
- final byte[] unclippedReadQuals = read.getBaseQualities();
-
- // Do a stricter base clipping than provided by CIGAR string, since this one may be too conservative,
- // and may leave a string of Q2 bases still hanging off the reads.
- for (int i = unclippedReadBases.length - numEndClippedBases - 1; i >= 0; i--) {
- if (unclippedReadQuals[i] < PairHMMIndelErrorModel.BASE_QUAL_THRESHOLD)
- numEndClippedBases++;
- else
- break;
- }
-
- return numEndClippedBases;
- }
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java
index dc08f58c7..0f3496015 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasBySample.java
@@ -52,7 +52,6 @@
package org.broadinstitute.gatk.tools.walkers.annotator;
import org.apache.log4j.Logger;
-import org.apache.commons.lang.StringUtils;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
@@ -117,7 +116,7 @@ public class StrandBiasBySample extends GenotypeAnnotation {
final int[][] table = FisherStrand.getContingencyTable(Collections.singletonMap(g.getSampleName(), alleleLikelihoodMap), vc, 0);
- gb.attribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, FisherStrand.getContingencyArray(table));
+ gb.attribute(GATKVCFConstants.STRAND_BIAS_BY_SAMPLE_KEY, StrandBiasTableUtils.getContingencyArray(table));
}
@Override
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTableUtils.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTableUtils.java
new file mode 100644
index 000000000..563bbe155
--- /dev/null
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTableUtils.java
@@ -0,0 +1,250 @@
+/*
+* By downloading the PROGRAM you agree to the following terms of use:
+*
+* BROAD INSTITUTE
+* SOFTWARE LICENSE AGREEMENT
+* FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
+*
+* This Agreement is made between the Broad Institute, Inc. with a principal address at 415 Main Street, Cambridge, MA 02142 (“BROAD”) and the LICENSEE and is effective at the date the downloading is completed (“EFFECTIVE DATE”).
+*
+* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
+* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
+* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
+*
+* 1. DEFINITIONS
+* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK3 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute.org/gatk on the EFFECTIVE DATE.
+*
+* 2. LICENSE
+* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. LICENSEE hereby automatically grants to BROAD a non-exclusive, royalty-free, irrevocable license to any LICENSEE bug fixes or modifications to the PROGRAM with unlimited rights to sublicense and/or distribute. LICENSEE agrees to provide any such modifications and bug fixes to BROAD promptly upon their creation.
+* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
+* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
+* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
+*
+* 3. PHONE-HOME FEATURE
+* LICENSEE expressly acknowledges that the PROGRAM contains an embedded automatic reporting system (“PHONE-HOME”) which is enabled by default upon download. Unless LICENSEE requests disablement of PHONE-HOME, LICENSEE agrees that BROAD may collect limited information transmitted by PHONE-HOME regarding LICENSEE and its use of the PROGRAM. Such information shall include LICENSEE’S user identification, version number of the PROGRAM and tools being run, mode of analysis employed, and any error reports generated during run-time. Collection of such information is used by BROAD solely to monitor usage rates, fulfill reporting requirements to BROAD funding agencies, drive improvements to the PROGRAM, and facilitate adjustments to PROGRAM-related documentation.
+*
+* 4. OWNERSHIP OF INTELLECTUAL PROPERTY
+* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
+* Copyright 2012-2015 Broad Institute, Inc.
+* Notice of attribution: The GATK3 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
+* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
+*
+* 5. INDEMNIFICATION
+* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
+*
+* 6. NO REPRESENTATIONS OR WARRANTIES
+* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
+* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
+*
+* 7. ASSIGNMENT
+* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
+*
+* 8. MISCELLANEOUS
+* 8.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
+* 8.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
+* 8.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
+* 8.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
+* 8.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
+* 8.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
+* 8.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
+*/
+
+package org.broadinstitute.gatk.tools.walkers.annotator;
+
+import cern.jet.math.Arithmetic;
+import org.apache.log4j.Logger;
+import org.broadinstitute.gatk.utils.QualityUtils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A class containing many convenience methods used in the strand bias annotation calculations
+ */
+public class StrandBiasTableUtils {
+
+ private final static Logger logger = Logger.getLogger(StrandBiasTableUtils.class);
+
+ //For now this is only for 2x2 contingency tables
+ protected static final int ARRAY_DIM = 2;
+ protected static final int ARRAY_SIZE = ARRAY_DIM * ARRAY_DIM;
+ private static double MIN_PVALUE = 1E-320;
+ // how large do we want the normalized table to be?
+ private static final double TARGET_TABLE_SIZE = 200.0;
+ private final static double AUGMENTATION_CONSTANT = 1.0;
+
+ /**
+ * Computes a two-sided p-Value for a Fisher's exact test on the contingency table, after normalizing counts so that the sum does not exceed {@value org.broadinstitute.gatk.tools.walkers.annotator.StrandBiasTableUtils#TARGET_TABLE_SIZE}
+ * @param originalTable
+ * @return
+ */
+ public static Double FisherExactPValueForContingencyTable(int[][] originalTable) {
+ final int[][] normalizedTable = normalizeContingencyTable(originalTable);
+
+ int[][] table = copyContingencyTable(normalizedTable);
+
+ double pCutoff = computePValue(table);
+
+ double pValue = pCutoff;
+ while (rotateTable(table)) {
+ double pValuePiece = computePValue(table);
+
+ if (pValuePiece <= pCutoff) {
+ pValue += pValuePiece;
+ }
+ }
+
+ table = copyContingencyTable(normalizedTable);
+ while (unrotateTable(table)) {
+ double pValuePiece = computePValue(table);
+
+ if (pValuePiece <= pCutoff) {
+ pValue += pValuePiece;
+ }
+ }
+
+ // min is necessary as numerical precision can result in pValue being slightly greater than 1.0
+ return Math.min(pValue, 1.0);
+ }
+
+ /**
+ * Helper function to turn the FisherStrand table into the SB annotation array
+ * @param table the table used by the FisherStrand annotation
+ * @return the array used by the per-sample Strand Bias annotation
+ */
+ public static List getContingencyArray( final int[][] table ) {
+ if(table.length != ARRAY_DIM || table[0].length != ARRAY_DIM) {
+ logger.warn("Expecting a " + ARRAY_DIM + "x" + ARRAY_DIM + " strand bias table.");
+ return null;
+ }
+
+ final List list = new ArrayList<>(ARRAY_SIZE);
+ list.add(table[0][0]);
+ list.add(table[0][1]);
+ list.add(table[1][0]);
+ list.add(table[1][1]);
+ return list;
+ }
+
+ /**
+ * Printing information to logger.info for debugging purposes
+ *
+ * @param name the name of the table
+ * @param table the table itself
+ */
+ public static void printTable(final String name, final int[][] table) {
+ final String pValue = String.format("%.3f", QualityUtils.phredScaleErrorRate(Math.max(FisherExactPValueForContingencyTable(table), MIN_PVALUE)));
+ logger.info(String.format("FS %s (REF+, REF-, ALT+, ALT-) = (%d, %d, %d, %d) = %s",
+ name, table[0][0], table[0][1], table[1][0], table[1][1], pValue));
+ }
+
+ /**
+ * Adds the small value AUGMENTATION_CONSTANT to all the entries of the table.
+ *
+ * @param table the table to augment
+ * @return the augmented table
+ */
+ protected static double[][] augmentContingencyTable(final int[][] table) {
+ double[][] augmentedTable = new double[ARRAY_DIM][ARRAY_DIM];
+ for ( int i = 0; i < ARRAY_DIM; i++ ) {
+ for ( int j = 0; j < ARRAY_DIM; j++ )
+ augmentedTable[i][j] = table[i][j] + AUGMENTATION_CONSTANT;
+ }
+
+ return augmentedTable;
+ }
+
+ /**
+ * Normalize the table so that the entries are not too large.
+ * Note that this method does NOT necessarily make a copy of the table being passed in!
+ *
+ * @param table the original table
+ * @return a normalized version of the table or the original table if it is already normalized
+ */
+ protected static int[][] normalizeContingencyTable(final int[][] table) {
+ final int sum = table[0][0] + table[0][1] + table[1][0] + table[1][1];
+ if ( sum <= TARGET_TABLE_SIZE * 2 )
+ return table;
+
+ final double normalizationFactor = (double)sum / TARGET_TABLE_SIZE;
+
+ final int[][] normalized = new int[ARRAY_DIM][ARRAY_DIM];
+ for ( int i = 0; i < ARRAY_DIM; i++ ) {
+ for ( int j = 0; j < ARRAY_DIM; j++ )
+ normalized[i][j] = (int)(table[i][j] / normalizationFactor);
+ }
+
+ return normalized;
+ }
+
+ public static int [][] copyContingencyTable(int [][] t) {
+ int[][] c = new int[ARRAY_DIM][ARRAY_DIM];
+
+ for ( int i = 0; i < ARRAY_DIM; i++ ) {
+ //System.arraycopy(t,0,c,0,ARRAY_DIM);
+ for (int j = 0; j < ARRAY_DIM; j++) {
+ c[i][j] = t[i][j];
+ }
+ }
+
+ return c;
+ }
+
+ protected static boolean rotateTable(int[][] table) {
+ table[0][0]--;
+ table[1][0]++;
+
+ table[0][1]++;
+ table[1][1]--;
+
+ return (table[0][0] >= 0 && table[1][1] >= 0);
+ }
+
+ protected static boolean unrotateTable(int[][] table) {
+ table[0][0]++;
+ table[1][0]--;
+
+ table[0][1]--;
+ table[1][1]++;
+
+ return (table[0][1] >= 0 && table[1][0] >= 0);
+ }
+
+ protected static double computePValue(int[][] table) {
+
+ int[] rowSums = { sumRow(table, 0), sumRow(table, 1) };
+ int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) };
+ int N = rowSums[0] + rowSums[1];
+
+ // calculate in log space for better precision
+ double pCutoff = Arithmetic.logFactorial(rowSums[0])
+ + Arithmetic.logFactorial(rowSums[1])
+ + Arithmetic.logFactorial(colSums[0])
+ + Arithmetic.logFactorial(colSums[1])
+ - Arithmetic.logFactorial(table[0][0])
+ - Arithmetic.logFactorial(table[0][1])
+ - Arithmetic.logFactorial(table[1][0])
+ - Arithmetic.logFactorial(table[1][1])
+ - Arithmetic.logFactorial(N);
+ return Math.exp(pCutoff);
+ }
+
+ private static int sumRow(int[][] table, int column) {
+ int sum = 0;
+ for (int r = 0; r < table.length; r++) {
+ sum += table[r][column];
+ }
+
+ return sum;
+ }
+
+ private static int sumColumn(int[][] table, int row) {
+ int sum = 0;
+ for (int c = 0; c < table[row].length; c++) {
+ sum += table[row][c];
+ }
+
+ return sum;
+ }
+
+}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java
index 11ee801b8..ac944b0ec 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandBiasTest.java
@@ -58,6 +58,7 @@ import htsjdk.variant.vcf.VCFFormatHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.log4j.Logger;
import org.broadinstitute.gatk.engine.GenomeAnalysisEngine;
+import org.broadinstitute.gatk.tools.walkers.annotator.interfaces.ActiveRegionBasedAnnotation;
import org.broadinstitute.gatk.utils.contexts.AlignmentContext;
import org.broadinstitute.gatk.utils.contexts.ReferenceContext;
import org.broadinstitute.gatk.utils.refdata.RefMetaDataTracker;
@@ -77,7 +78,8 @@ import java.util.*;
/**
* Class of tests to detect strand bias.
*/
-public abstract class StrandBiasTest extends InfoFieldAnnotation {
+//TODO: will eventually implement ReducibleAnnotation -- see RMSAnnotation.java for an example of an abstract ReducibleAnnotation
+public abstract class StrandBiasTest extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation {
private final static Logger logger = Logger.getLogger(StrandBiasTest.class);
private static boolean stratifiedPerReadAlleleLikelihoodMapWarningLogged = false;
private static boolean inputVariantContextWarningLogged = false;
@@ -312,7 +314,6 @@ public abstract class StrandBiasTest extends InfoFieldAnnotation {
private static void updateTable(final int[] table, final Allele allele, final GATKSAMRecord read, final Allele ref, final List allAlts) {
final boolean matchesRef = allele.equals(ref, true);
- final boolean matchesAlt = allele.equals(allAlts.get(0), true);
final boolean matchesAnyAlt = allAlts.contains(allele);
if ( matchesRef || matchesAnyAlt ) {
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
index 35986bdc7..38822a055 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/annotator/StrandOddsRatio.java
@@ -101,7 +101,6 @@ import java.util.*;
*
*/
public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotation, ActiveRegionBasedAnnotation {
- private final static double AUGMENTATION_CONSTANT = 1.0;
private static final int MIN_COUNT = 0;
@Override
@@ -132,17 +131,17 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
}
/**
- * Computes the SOR value of a table after augmentation. Based on the symmetric odds ratio but modified to take on
+ * Computes the SOR value of a table after augmentation (adding pseudocounts). Based on the symmetric odds ratio but modified to take on
* low values when the reference +/- read count ratio is skewed but the alt count ratio is not. Natural log is taken
* to keep values within roughly the same range as other annotations.
*
- * Augmentation avoids quotient by zero.
+ * Adding pseudocounts prevent divide-by-zero.
*
* @param originalTable The table before augmentation
* @return the SOR annotation value
*/
final protected double calculateSOR(final int[][] originalTable) {
- final double[][] augmentedTable = augmentContingencyTable(originalTable);
+ final double[][] augmentedTable = StrandBiasTableUtils.augmentContingencyTable(originalTable);
double ratio = 0;
@@ -158,22 +157,6 @@ public class StrandOddsRatio extends StrandBiasTest implements StandardAnnotatio
}
- /**
- * Adds the small value AUGMENTATION_CONSTANT to all the entries of the table.
- *
- * @param table the table to augment
- * @return the augmented table
- */
- private static double[][] augmentContingencyTable(final int[][] table) {
- double[][] augmentedTable = new double[ARRAY_DIM][ARRAY_DIM];
- for ( int i = 0; i < ARRAY_DIM; i++ ) {
- for ( int j = 0; j < ARRAY_DIM; j++ )
- augmentedTable[i][j] = table[i][j] + AUGMENTATION_CONSTANT;
- }
-
- return augmentedTable;
- }
-
/**
* Returns an annotation result given a ratio
*
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
index e2a1ba850..d37f1946e 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java
@@ -311,7 +311,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
* to provide a pedigree file for a pedigree-based annotation) may cause the run to fail.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
- protected String[] annotationClassesToUse = { "Standard" };
+ protected String[] annotationGroupsToUse = { "Standard" };
@ArgumentCollection
private HaplotypeCallerArgumentCollection HCAC = new HaplotypeCallerArgumentCollection();
@@ -623,10 +623,14 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
genotypingEngine = new HaplotypeCallerGenotypingEngine(HCAC, samplesList, genomeLocParser, FixedAFCalculatorProvider.createThreadSafeProvider(getToolkit(), HCAC,logger), !doNotRunPhysicalPhasing);
// initialize the output VCF header
- final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
+ final VariantAnnotatorEngine annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationGroupsToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
final Set headerInfo = new HashSet<>();
+ //initialize the annotations (this is particularly important to turn off RankSumTest dithering in integration tests)
+ //do this before we write the header because SnpEff adds to header lines
+ annotationEngine.invokeAnnotationInitializationMethods(headerInfo);
+
headerInfo.addAll(genotypingEngine.getAppropriateVCFInfoHeaders());
// all annotation fields from VariantAnnotatorEngine
headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
@@ -654,9 +658,6 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
vcfWriter.writeHeader(new VCFHeader(headerInfo, sampleSet));
- //now that we have all the VCF headers, initialize the annotations (this is particularly important to turn off RankSumTest dithering in integration tests)
- annotationEngine.invokeAnnotationInitializationMethods(headerInfo);
-
try {
// fasta reference reader to supplement the edges of the reference sequence
referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
@@ -1211,7 +1212,7 @@ public class HaplotypeCaller extends ActiveRegionWalker, In
*
* @return true if HC must emit reference confidence.
*/
- private boolean emitReferenceConfidence() {
+ public boolean emitReferenceConfidence() {
return HCAC.emitReferenceConfidence != ReferenceConfidenceMode.NONE;
}
diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
index 1ca9e6c9d..813a18946 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCallerGenotypingEngine.java
@@ -274,7 +274,7 @@ public class HaplotypeCallerGenotypingEngine extends GenotypingEngine {
+public class CombineGVCFs extends RodWalker implements AnnotatorCompatible {
+
+ /**
+ * Which annotations to recompute for the combined output VCF file.
+ */
+ @Advanced
+ @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute. The single value 'none' removes the default annotations", required=false)
+ protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"AS_RMSMappingQuality"}));
+
+ /**
+ * Which groups of annotations to add to the output VCF file. The single value 'none' removes the default group. See
+ * the VariantAnnotator -list argument to view available groups. Note that this usage is not recommended because
+ * it obscures the specific requirements of individual annotations. Any requirements that are not met (e.g. failing
+ * to provide a pedigree file for a pedigree-based annotation) may cause the run to fail.
+ */
+ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
+ protected String[] annotationGroupsToUse = { "Standard" };
+
+
+ /**
+ * The rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. Note that dbSNP is not used in any way for the calculations themselves.
+ */
+ @ArgumentCollection
+ protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
+ public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; }
+ public List> getCompRodBindings() { return Collections.emptyList(); }
+ public RodBinding getSnpEffRodBinding() { return null; }
+ public List> getResourceRodBindings() { return Collections.emptyList(); }
+ public boolean alwaysAppendDbsnpId() { return false; }
+
+ // the annotation engine
+ private VariantAnnotatorEngine annotationEngine;
protected final class PositionalState {
final List VCs;
@@ -177,6 +211,12 @@ public class CombineGVCFs extends RodWalkeremptyList(), this, getToolkit());
+
+ //now that we have all the VCF headers, initialize the annotations (this is particularly important to turn off RankSumTest dithering in integration tests)
+ annotationEngine.invokeAnnotationInitializationMethods(headerLines);
+
// optimization to prevent mods when we always just want to break bands
if ( multipleAtWhichToBreakBands == 1 )
USE_BP_RESOLUTION = true;
@@ -321,7 +361,7 @@ public class CombineGVCFs extends RodWalker annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts", "StrandOddsRatio"}));
+ /**
+ * Which groups of annotations to add to the output VCF file. The single value 'none' removes the default group. See
+ * the VariantAnnotator -list argument to view available groups. Note that this usage is not recommended because
+ * it obscures the specific requirements of individual annotations. Any requirements that are not met (e.g. failing
+ * to provide a pedigree file for a pedigree-based annotation) may cause the run to fail.
+ */
+ @Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
+ protected String[] annotationGroupsToUse = {};
+
+
/**
* The rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. Note that dbSNP is not used in any way for the calculations themselves.
*/
@@ -211,7 +221,7 @@ public class GenotypeGVCFs extends RodWalkeremptyList(), this, toolkit);
+ annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationGroupsToUse), annotationsToUse, Collections.emptyList(), this, toolkit);
// take care of the VCF headers
final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
@@ -230,6 +240,9 @@ public class GenotypeGVCFs extends RodWalker array ) {
return MathUtils.median(array); // right now we take the median but other options could be explored
}
@@ -89,10 +95,10 @@ public class ReferenceConfidenceVariantContextMerger {
* @return new VariantContext representing the merge of all VCs or null if it not relevant
*/
public static VariantContext merge(final List VCs, final GenomeLoc loc, final Byte refBase, final boolean removeNonRefSymbolicAllele,
- final boolean samplesAreUniquified) {
+ final boolean samplesAreUniquified, final VariantAnnotatorEngine annotatorEngine) {
// this can happen if e.g. you are using a dbSNP file that spans a region with no gVCFs
- if ( VCs == null || VCs.isEmpty() )
- return null;
+ if ( VCs == null || VCs.isEmpty() ) {
+ return null; }
// establish the baseline info (sometimes from the first VC)
final VariantContext first = VCs.get(0);
@@ -100,8 +106,8 @@ public class ReferenceConfidenceVariantContextMerger {
// ref allele
final Allele refAllele = determineReferenceAlleleGivenReferenceBase(VCs, loc, refBase);
- if ( refAllele == null )
- return null;
+ if ( refAllele == null ) {
+ return null; }
// FinalAlleleSet contains the alleles of the new resulting VC
// Using linked set in order to guarantee a stable order
@@ -112,7 +118,7 @@ public class ReferenceConfidenceVariantContextMerger {
final Map attributes = new LinkedHashMap<>();
final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id
int depth = 0;
- final Map> annotationMap = new LinkedHashMap<>();
+ final Map> annotationMap = new LinkedHashMap<>();
final GenotypesContext genotypes = GenotypesContext.create();
// In this list we hold the mapping of each variant context alleles.
@@ -156,28 +162,42 @@ public class ReferenceConfidenceVariantContextMerger {
}
}
- if ( loc.getStart() != vc.getStart() )
+ if ( loc.getStart() != vc.getStart() ) {
continue;
+ }
// special case ID (just preserve it)
if ( vc.hasID() ) rsIDs.add(vc.getID());
- // add attributes
- addReferenceConfidenceAttributes(vc.getAttributes(), annotationMap);
+ // add attributes to annotationMap, store all info field annotations as AlleleSpecificAnnotationData in case they can be parsed that way
+ addReferenceConfidenceAttributes(pair, annotationMap);
}
- // when combining annotations use the median value from all input VCs which had annotations provided
- for ( final Map.Entry> p : annotationMap.entrySet() ) {
+ //combine the annotations that are reducible and remove them from annotationMap
+ Map combinedAnnotations = new HashMap<>();
+ if (annotatorEngine != null) {
+ combinedAnnotations = annotatorEngine.combineAnnotations(allelesList, annotationMap);
+ }
+ attributes.putAll(combinedAnnotations);
+
+ // remove stale AC and AF based attributes (including MLEAC and MLEAF lists)
+ //these will be recalculated after genotyping
+ removeStaleAttributesAfterMerge(annotationMap);
+
+ //annotatorEngine.combineAnnotations removed the successfully combined annotations, so now parse those that are left
+ //here we're assuming that things that are left are scalars per sample
+ Map> parsedAnnotationMap = parseRemainingAnnotations(annotationMap);
+
+ // when combining remaining annotations use the median value from all input VCs which had annotations provided
+ for ( final Map.Entry> p : parsedAnnotationMap.entrySet() ) {
if ( ! p.getValue().isEmpty() ) {
attributes.put(p.getKey(), combineAnnotationValues(p.getValue()));
}
}
- if ( depth > 0 )
+ if ( depth > 0 ) {
attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));
-
- // remove stale AC and AF based attributes
- removeStaleAttributesAfterMerge(attributes);
+ }
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
@@ -189,6 +209,39 @@ public class ReferenceConfidenceVariantContextMerger {
return builder.make();
}
+ /**
+ * parse the annotations that were not identified as reducible annotations and combined by the annotation engine
+ * @param annotationMap the map of info field annotation names and the list of their data from the merged VCs
+ * @return info field data parsed as ints or doubles
+ */
+ private static Map> parseRemainingAnnotations(final Map> annotationMap) {
+ final Map> parsedAnnotations = new HashMap<>();
+ for (Map.Entry> currentData : annotationMap.entrySet()) {
+ List annotationValues = new ArrayList<>();
+ for (ReducibleAnnotationData value : currentData.getValue()) {
+ try {
+ final String stringValue = value.getRawData();
+ if (stringValue.contains(".")) {
+ annotationValues.add(Double.parseDouble(stringValue));
+ } else if (Character.isDigit(stringValue.charAt(0))){
+ annotationValues.add(Integer.parseInt(stringValue));
+ //TODO: uncomment this to parse dbSNP membership annotation once allele-specific merging for that attribute is added
+ /*} else if (Character.isLetter(stringValue.charAt(0))) {
+ if (stringValue.equalsIgnoreCase("true"))
+ annotationValues.add(true);
+ else if (stringValue.equalsIgnoreCase("false"))
+ annotationValues.add(false);*/
+ }
+
+ } catch (final NumberFormatException e) {
+ logger.warn("WARNING: remaining (non-reducible) annotations are assumed to be ints or doubles or booleans, but " + value.getRawData() + " doesn't parse and will not be annotated in the final VC.");
+ }
+ }
+ parsedAnnotations.put(currentData.getKey(),annotationValues);
+ }
+ return parsedAnnotations;
+ }
+
/**
* @param list the original alleles list
* @return a non-null list of non-symbolic alleles
@@ -196,8 +249,9 @@ public class ReferenceConfidenceVariantContextMerger {
private static List nonSymbolicAlleles(final List list) {
final List result = new ArrayList<>(list.size());
for ( final Allele allele : list ) {
- if ( !allele.isSymbolic() )
+ if ( !allele.isSymbolic() ) {
result.add(allele);
+ }
}
return result;
}
@@ -212,8 +266,9 @@ public class ReferenceConfidenceVariantContextMerger {
*/
private static Allele determineReferenceAlleleGivenReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) {
final Allele refAllele = GATKVariantContextUtils.determineReferenceAllele(VCs, loc);
- if ( refAllele == null )
- return ( refBase == null ? null : Allele.create(refBase, true) );
+ if ( refAllele == null ) {
+ return (refBase == null ? null : Allele.create(refBase, true));
+ }
return refAllele;
}
@@ -222,7 +277,7 @@ public class ReferenceConfidenceVariantContextMerger {
*
* @param attributes the attribute map
*/
- private static void removeStaleAttributesAfterMerge(final Map attributes) {
+ private static void removeStaleAttributesAfterMerge(final Map> attributes) {
attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
attributes.remove(VCFConstants.ALLELE_NUMBER_KEY);
@@ -234,31 +289,35 @@ public class ReferenceConfidenceVariantContextMerger {
/**
* Adds attributes to the global map from the new context in a sophisticated manner
*
- * @param myAttributes attributes to add from
+ * @param pair VariantContext/Allele list pair from which to get attributes
* @param annotationMap map of annotations for combining later
*/
- private static void addReferenceConfidenceAttributes(final Map myAttributes,
- final Map> annotationMap) {
+ private static void addReferenceConfidenceAttributes(Pair> pair,
+ final Map> annotationMap) {
+ final Map myAttributes = pair.getFirst().getAttributes(); //these are the info field attributes of the VC in pair
+ final List sampleAlleles = pair.getSecond();
+
for ( final Map.Entry p : myAttributes.entrySet() ) {
final String key = p.getKey();
- final Object value = p.getValue();
+ //allele-specific attributes will always be in list form because they've already been parsed per-allele
+ //non-allele-specific attributes (DP, etc.) will be a list of length 1
+ final List