diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java index 6f16a704f..7457acb22 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ConsensusAlleleCounter.java @@ -286,7 +286,7 @@ public class ConsensusAlleleCounter { if (vcs.isEmpty()) return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false); return mergedVC.getAlleles(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 697d162fd..139f2e07d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -183,7 +183,7 @@ public class GenotypingEngine { final List priorityList = makePriorityList(eventsAtThisLoc); // Merge the event to find a common reference representation - final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false); + final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java new file mode 100644 index 000000000..a587b0250 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariants.java @@ -0,0 +1,228 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Window; +import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants; +import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.variant.GATKVCFUtils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.variant.vcf.*; + +import java.util.*; + +/** + * Combines gVCF records that were produced by the Haplotype Caller from single sample sources. + * + *

+ * CombineReferenceCalculationVariants combines gVCF records that were produced as part of the "single sample discovery" + * pipeline using the '-ERC GVCF' mode of the Haplotype Caller. This tools performs the multi-sample joint aggregation + * step and merges the records together in a sophisticated manner. + * + * At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods, + * re-genotype the newly merged record, and then re-annotate it. + * + * + *

Input

+ *

+ * One or more Haplotype Caller gVCFs to combine. + *

+ * + *

Output

+ *

+ * A combined VCF. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T CombineReferenceCalculationVariants \
+ *   --variant input1.vcf \
+ *   --variant input2.vcf \
+ *   -o output.vcf
+ * 
+ * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) +@Reference(window=@Window(start=-10,stop=10)) +public class CombineReferenceCalculationVariants extends RodWalker implements AnnotatorCompatible, TreeReducible { + + // TODO -- allow a file of VCF paths to be entered? + + /** + * The VCF files to merge together + */ + @Input(fullName="variant", shortName = "V", doc="One or more input VCF files", required=true) + public List> variants; + + @Output(doc="File to which variants should be written") + protected VariantContextWriter vcfWriter = null; + + @Argument(fullName="includeNonVariants", shortName="inv", doc="Include loci found to be non-variant after the combining procedure", required=false) + public boolean INCLUDE_NON_VARIANTS = false; + + /** + * Which annotations to recompute for the combined output VCF file. + */ + @Advanced + @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute", required=false) + protected List annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts"})); + + /** + * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate. + * dbSNP is not used in any way for the calculations themselves. + */ + @ArgumentCollection + protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); + public RodBinding getDbsnpRodBinding() { return dbsnp.dbsnp; } + + // the genotyping engine + private UnifiedGenotyperEngine genotypingEngine; + // the annotation engine + private VariantAnnotatorEngine annotationEngine; + + public List> getCompRodBindings() { return Collections.emptyList(); } + public RodBinding getSnpEffRodBinding() { return null; } + public List> getResourceRodBindings() { return Collections.emptyList(); } + public boolean alwaysAppendDbsnpId() { return false; } + + + public void initialize() { + // take care of the VCF headers + final Map vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit()); + final Set samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE); + final Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true); + headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions)); + final VCFHeader vcfHeader = new VCFHeader(headerLines, samples); + vcfWriter.writeHeader(vcfHeader); + + // create the genotyping engine + final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); + UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH; + UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; + UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY); + + // create the annotation engine + annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.emptyList(), this, getToolkit()); + } + + public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { + if ( tracker == null ) // RodWalkers can make funky map calls + return null; + + final GenomeLoc loc = ref.getLocus(); + final VariantContext combinedVC = GATKVariantContextUtils.referenceConfidenceMerge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null); + if ( combinedVC == null ) + return null; + + return regenotypeVC(tracker, ref, combinedVC); + } + + /** + * Re-genotype (and re-annotate) a combined genomic VC + * + * @param tracker the ref tracker + * @param ref the ref context + * @param combinedVC the combined genomic VC + * @return a new VariantContext or null if the site turned monomorphic and we don't want such sites + */ + protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext combinedVC) { + if ( combinedVC == null ) throw new IllegalArgumentException("combinedVC cannot be null"); + + VariantContext result = combinedVC; + + // only re-genotype polymorphic sites + if ( combinedVC.isVariant() ) + result = genotypingEngine.calculateGenotypes(result); + + // if it turned monomorphic and we don't want such sites, quit + if ( !INCLUDE_NON_VARIANTS && result.isMonomorphicInSamples() ) + return null; + + // re-annotate it + return annotationEngine.annotateContext(tracker, ref, null, result); + } + + public VariantContextWriter reduceInit() { + return vcfWriter; + } + + public VariantContextWriter reduce(final VariantContext vc, final VariantContextWriter writer) { + if ( vc != null ) + writer.add(vc); + return writer; + } + + @Override + public VariantContextWriter treeReduce(final VariantContextWriter lhs, final VariantContextWriter rhs) { + return lhs; + } + + @Override + public void onTraversalDone(final VariantContextWriter writer) {} +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java new file mode 100644 index 000000000..7b546b8db --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineReferenceCalculationVariantsIntegrationTest.java @@ -0,0 +1,72 @@ +/* +* By downloading the PROGRAM you agree to the following terms of use: +* +* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY +* +* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE). +* +* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and +* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions. +* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows: +* +* 1. DEFINITIONS +* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE. +* +* 2. LICENSE +* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM. +* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement. +* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement. +* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM. +* +* 3. OWNERSHIP OF INTELLECTUAL PROPERTY +* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication. +* Copyright 2012 Broad Institute, Inc. +* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc. +* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes. +* +* 4. INDEMNIFICATION +* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement. +* +* 5. NO REPRESENTATIONS OR WARRANTIES +* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME. +* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING. +* +* 6. ASSIGNMENT +* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void. +* +* 7. MISCELLANEOUS +* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries. +* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes. +* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4. +* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt. +* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter. +* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement. +* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles. +*/ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class CombineReferenceCalculationVariantsIntegrationTest extends WalkerTest { + + private static String baseTestString(String args, String ref) { + return "-T CombineReferenceCalculationVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args; + } + + // TODO -- enable this test (and create others) once the Haplotype Caller produces appropriate gVCFs (with for every record) + @Test(enabled = false) + public void combineSingleSamplePipelineGVCF() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + + " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + + " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + + " -L 20:10,000,000-10,001,000", b37KGReference), + 1, + Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); + executeTest("combineSingleSamplePipelineGVCF", spec); + } +} \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 2eeb9221e..fb54ab400 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -185,19 +185,6 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); } - @Test public void combineSingleSamplePipelineGVCF() { - WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" + - " -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" + - " -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" + - " -multipleAllelesMergeType MIX_TYPES" + - " --excludeNonVariants -combineAnnotations -setKey null" + - " -L 20:10,000,000-10,001,000", b37KGReference), - 1, - Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a")); - cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true); - } - @Test public void combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 5a1b015fe..e194a9c43 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -170,7 +170,36 @@ public class RefMetaDataTracker { @Requires({"type != null", "onlyAtThisLoc != null"}) public T getFirstValue(final Class type, final GenomeLoc onlyAtThisLoc) { return safeGetFirst(getValues(type, onlyAtThisLoc)); + } + /** + * Same logic as @link #getFirstValue(RodBinding, boolean) but prioritizes records from prioritizeThisLoc if available + * + * @param rodBindings Only Features coming from the tracks associated with one of rodBindings are fetched + * @param The Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features + * @param prioritizeThisLoc only Features starting at this site are considered + * @return A freshly allocated list of all of the bindings, or an empty list if none are bound. + */ + @Requires({"rodBindings != null", "prioritizeThisLoc != null"}) + @Ensures("result != null") + public List getPrioritizedValue(final Collection> rodBindings, final GenomeLoc prioritizeThisLoc) { + final List results = new ArrayList<>(); + + for ( final RodBinding rodBinding : rodBindings ) { + + // if there's a value at the prioritized location, take it + T value = getFirstValue(rodBinding, prioritizeThisLoc); + + // otherwise, grab any one + if ( value == null ) + value = getFirstValue(rodBinding); + + // add if not null + if ( value != null ) + results.add(value); + } + + return results; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 1362b109e..e13252d49 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -191,9 +191,6 @@ public class CombineVariants extends RodWalker implements Tree @Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false) public boolean MERGE_INFO_WITH_MAX_AC = false; - @Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false) - public boolean COMBINE_ANNOTATIONS = false; - private List priority = null; /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ @@ -260,12 +257,9 @@ public class CombineVariants extends RodWalker implements Tree // get all of the vcf rods at this locus // Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getValues(variants, context.getLocation()); - Collection potentialRefVCs = tracker.getValues(variants); - potentialRefVCs.removeAll(vcs); if ( sitesOnlyVCF ) { vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); - potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs); } if ( ASSUME_IDENTICAL_SAMPLES ) { @@ -307,17 +301,16 @@ public class CombineVariants extends RodWalker implements Tree // make sure that it is a variant or in case it is not, that we want to include the sites with no variants if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) { if (VCsByType.containsKey(type)) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs, - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); + mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(), + filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, + SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } } } } else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) { - mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs, - priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS)); + mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType, + genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } else { logger.warn("Ignoring all records at site " + ref.getLocus()); diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 1ae34e268..e0c32d5d5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -790,7 +790,6 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, @@ -801,10 +800,9 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC, - final boolean combineAnnotations ) { + final boolean mergeInfoWithMaxAC ) { int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size(); - return simpleMerge(unsortedVCs, Collections.emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations); + return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC); } /** @@ -817,7 +815,6 @@ public class GATKVariantContextUtils { * For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/ * * @param unsortedVCs collection of unsorted VCs - * @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs * @param filteredRecordMergeType merge type for filtered records * @param genotypeMergeOptions merge option for genotypes @@ -826,11 +823,9 @@ public class GATKVariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d. * @return new VariantContext representing the merge of unsortedVCs */ public static VariantContext simpleMerge(final Collection unsortedVCs, - final Collection potentialRefVCs, final List priorityListOfVCs, final int originalNumOfVCs, final FilteredRecordMergeType filteredRecordMergeType, @@ -839,9 +834,7 @@ public class GATKVariantContextUtils { final boolean printMessages, final String setKey, final boolean filteredAreUncalled, - final boolean mergeInfoWithMaxAC, - final boolean combineAnnotations ) { - + final boolean mergeInfoWithMaxAC ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -860,9 +853,6 @@ public class GATKVariantContextUtils { VCs.add(vc); } - // cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible - VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs); - if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled return null; @@ -950,38 +940,19 @@ public class GATKVariantContextUtils { for (final Map.Entry p : vc.getAttributes().entrySet()) { final String key = p.getKey(); final Object value = p.getValue(); - boolean badAnnotation = false; - if ( combineAnnotations ) { // add the annotation values to a list for combining later - List values = annotationMap.get(key); - if( values == null ) { - values = new ArrayList<>(); - annotationMap.put(key, values); - } - try { - final String stringValue = value.toString(); - // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. - if (stringValue.contains(".")) - values.add(Double.parseDouble(stringValue)); - else - values.add(Integer.parseInt(stringValue)); - } catch (NumberFormatException e) { - badAnnotation = true; - } - } - if ( ! combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC - // if we don't like the key already, don't go anywhere - if ( ! inconsistentAttributes.contains(key) ) { - final boolean alreadyFound = attributes.containsKey(key); - final Object boundValue = attributes.get(key); - final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + // only output annotations that have the same value in every input VC + // if we don't like the key already, don't go anywhere + if ( ! inconsistentAttributes.contains(key) ) { + final boolean alreadyFound = attributes.containsKey(key); + final Object boundValue = attributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); - if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { - // we found the value but we're inconsistent, put it in the exclude list - inconsistentAttributes.add(key); - attributes.remove(key); - } else if ( ! alreadyFound || boundIsMissingValue ) { // no value - attributes.put(key, value); - } + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + attributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + attributes.put(key, value); } } } @@ -1007,12 +978,6 @@ public class GATKVariantContextUtils { // take the VC with the maxAC and pull the attributes into a modifiable map if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) { attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes()); - } else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided - for ( final Map.Entry> p : annotationMap.entrySet() ) { - if ( ! p.getValue().isEmpty() ) { - attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); - } - } } // if at least one record was unfiltered and we want a union, clear all of the filters @@ -1058,11 +1023,6 @@ public class GATKVariantContextUtils { builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters)); } builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes)); - if( combineAnnotations ) { - // unfortunately some attributes are just too dangerous to try to combine together - builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY); - builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); - } // Trim the padded bases of all alleles if necessary final VariantContext merged = builder.make(); @@ -1070,69 +1030,202 @@ public class GATKVariantContextUtils { return merged; } - private static final Comparable combineAnnotationValues( final List array ) { + private static Comparable combineAnnotationValues( final List array ) { return MathUtils.median(array); // right now we take the median but other options could be explored } /** - * cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible - * @param VCs the list of VCs in which to fill in symbolic alleles - * @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles - * @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct alternate allele + * Merges VariantContexts from gVCFs into a single hybrid. + * Assumes that none of the input records are filtered. + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case + * @return new VariantContext representing the merge of all VCs or null if it not relevant */ - protected static final List fillInNonRefSymbolicAlleles( final List VCs, final Collection potentialRefVCs ) { - if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); } - if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); } + public static VariantContext referenceConfidenceMerge(final List VCs, final GenomeLoc loc, final Byte refBase) { - final List VCsToReturn = new ArrayList<>(VCs.size()); - boolean containsNonRefSymbolicAllele = false; - VariantContext nonRefVC = null; - for( final VariantContext vc : VCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - containsNonRefSymbolicAllele = true; - } else if ( nonRefVC == null ) { - nonRefVC = vc; - } - if( nonRefVC != null && containsNonRefSymbolicAllele == true ) { - break; // break out so that we don't run over the whole list unnecessarily - } + if ( VCs == null || VCs.size() == 0 ) throw new IllegalArgumentException("VCs cannot be null or empty"); + + // establish the baseline info (sometimes from the first VC) + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + + // ref allele + final Allele refAllele = determineReferenceAlleleGiveReferenceBase(VCs, loc, refBase); + if ( refAllele == null ) + return null; + + // alt alleles + final AlleleMapper alleleMapper = determineAlternateAlleleMapping(VCs, refAllele, loc); + final List alleles = getAllelesListFromMapper(refAllele, alleleMapper); + + final Map attributes = new LinkedHashMap<>(); + final Set inconsistentAttributes = new HashSet<>(); + final Set rsIDs = new LinkedHashSet<>(1); // most of the time there's one id + + VariantContext longestVC = first; + int depth = 0; + final Map> annotationMap = new LinkedHashMap<>(); + GenotypesContext genotypes = GenotypesContext.create(); + + // cycle through and add info from the other VCs + for ( final VariantContext vc : VCs ) { + + // if this context doesn't start at the current location then it must be a spanning event (deletion or ref block) + final boolean isSpanningEvent = loc.getStart() != vc.getStart(); + final List remappedAlleles = isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) : alleleMapper.remap(vc.getAlleles()); + mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, alleles); + + // special case DP (add it up) for all events + if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) ) + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); + + if ( isSpanningEvent ) + continue; + + // keep track of the longest location that starts here + if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) ) + longestVC = vc; + + // special case ID (just preserve it) + if ( vc.hasID() ) rsIDs.add(vc.getID()); + + // add attributes + addReferenceConfidenceAttributes(vc.getAttributes(), attributes, inconsistentAttributes, annotationMap); } - for( final VariantContext vc : potentialRefVCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { - containsNonRefSymbolicAllele = true; - VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below + + // when combining annotations use the median value from all input VCs which had annotations provided + for ( final Map.Entry> p : annotationMap.entrySet() ) { + if ( ! p.getValue().isEmpty() ) { + attributes.put(p.getKey(), combineAnnotationValues(p.getValue())); } } - if( !containsNonRefSymbolicAllele ) { - return VCs; - } + if ( depth > 0 ) + attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - for( final VariantContext vc : VCs ) { - if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site - if( nonRefVC != null ) { - final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size()); - int depth = 0; - for( final String sample : vc.getSampleNames() ) { - final Genotype gt = vc.getGenotype(sample); - final ArrayList refAlleles = new ArrayList<>(2); - refAlleles.add(nonRefVC.getReference()); - refAlleles.add(nonRefVC.getReference()); - final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now - depth += ( gt.hasDP() ? gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here - genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make()); - } - VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make()); - } - } else { - VCsToReturn.add(vc); - } - } + final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs); - return VCsToReturn; + final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles) + .loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd()) + .genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later + + // remove stale AC and AF based attributes + removeStaleAttributesAfterMerge(builder); + + return builder.make(); } - private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + /** + * Determines the ref allele given the provided reference base at this position + * + * @param VCs collection of unsorted genomic VCs + * @param loc the current location + * @param refBase the reference allele to use if all contexts in the VC are spanning + * @return new Allele or null if no reference allele/base is available + */ + private static Allele determineReferenceAlleleGiveReferenceBase(final List VCs, final GenomeLoc loc, final Byte refBase) { + final Allele refAllele = determineReferenceAllele(VCs, loc); + if ( refAllele == null ) + return ( refBase == null ? null : Allele.create(refBase, true) ); + return refAllele; + } + + /** + * Creates an alleles list given a reference allele and a mapper + * + * @param refAllele the reference allele + * @param alleleMapper the allele mapper + * @return a non-null, non-empty list of Alleles + */ + private static List getAllelesListFromMapper(final Allele refAllele, final AlleleMapper alleleMapper) { + final List alleles = new ArrayList<>(); + alleles.add(refAllele); + alleles.addAll(alleleMapper.getUniqueMappedAlleles()); + return alleles; + } + + /** + * Remove the stale attributes from the merged VariantContext (builder) + * + * @param builder the VC builder + */ + private static void removeStaleAttributesAfterMerge(final VariantContextBuilder builder) { + builder.rmAttribute(VCFConstants.ALLELE_COUNT_KEY); + builder.rmAttribute(VCFConstants.ALLELE_FREQUENCY_KEY); + builder.rmAttribute(VCFConstants.ALLELE_NUMBER_KEY); + builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY); + builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY); + } + + /** + * Adds attributes to the global map from the new context in a sophisticated manner + * + * @param myAttributes attributes to add from + * @param globalAttributes global set of attributes to add to + * @param inconsistentAttributes set of attributes that are inconsistent among samples + * @param annotationMap map of annotations for combining later + */ + private static void addReferenceConfidenceAttributes(final Map myAttributes, + final Map globalAttributes, + final Set inconsistentAttributes, + final Map> annotationMap) { + for ( final Map.Entry p : myAttributes.entrySet() ) { + final String key = p.getKey(); + final Object value = p.getValue(); + boolean badAnnotation = false; + + // add the annotation values to a list for combining later + List values = annotationMap.get(key); + if( values == null ) { + values = new ArrayList<>(); + annotationMap.put(key, values); + } + try { + final String stringValue = value.toString(); + // Branch to avoid unintentional, implicit type conversions that occur with the ? operator. + if (stringValue.contains(".")) + values.add(Double.parseDouble(stringValue)); + else + values.add(Integer.parseInt(stringValue)); + } catch (final NumberFormatException e) { + badAnnotation = true; + } + + // only output annotations that have the same value in every input VC + if ( badAnnotation && ! inconsistentAttributes.contains(key) ) { + checkForConsistency(key, value, globalAttributes, inconsistentAttributes); + } + } + } + + /** + * Check attributes for consistency to others in the merge + * + * @param key the attribute key + * @param value the attribute value + * @param globalAttributes the global list of attributes being merged + * @param inconsistentAttributes the list of inconsistent attributes in the merge + */ + private static void checkForConsistency(final String key, + final Object value, + final Map globalAttributes, + final Set inconsistentAttributes) { + final boolean alreadyFound = globalAttributes.containsKey(key); + final Object boundValue = globalAttributes.get(key); + final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4); + + if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) { + // we found the value but we're inconsistent, put it in the exclude list + inconsistentAttributes.add(key); + globalAttributes.remove(key); + } else if ( ! alreadyFound || boundIsMissingValue ) { // no value + globalAttributes.put(key, value); + } + } + + private static boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { final Iterator it1 = alleleSet1.iterator(); final Iterator it2 = alleleSet2.iterator(); @@ -1262,65 +1355,130 @@ public class GATKVariantContextUtils { return builder.make(); } - static private Allele determineReferenceAllele(List VCs) { + static private Allele determineReferenceAllele(final List VCs) { + return determineReferenceAllele(VCs, null); + } + + /** + * Determines the common reference allele + * + * @param VCs the list of VariantContexts + * @param loc if not null, ignore records that do not begin at this start location + * @return possibly null Allele + */ + static private Allele determineReferenceAllele(final List VCs, final GenomeLoc loc) { Allele ref = null; for ( final VariantContext vc : VCs ) { - final Allele myRef = vc.getReference(); - if ( ref == null || ref.length() < myRef.length() ) - ref = myRef; - else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) - throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + if ( contextMatchesLoc(vc, loc) ) { + final Allele myRef = vc.getReference(); + if ( ref == null || ref.length() < myRef.length() ) + ref = myRef; + else if ( ref.length() == myRef.length() && ! ref.equals(myRef) ) + throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef)); + } } return ref; } - static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set allAlleles) { + static private boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) { + return loc == null || loc.getStart() == vc.getStart(); + } + + /** + * Given the reference allele, determines the mapping for common alternate alleles in the list of VariantContexts. + * + * @param VCs the list of VariantContexts + * @param refAllele the reference allele + * @param loc if not null, ignore records that do not begin at this start location + * @return non-null AlleleMapper + */ + static private AlleleMapper determineAlternateAlleleMapping(final List VCs, final Allele refAllele, final GenomeLoc loc) { + final Map map = new HashMap<>(); + + for ( final VariantContext vc : VCs ) { + if ( contextMatchesLoc(vc, loc) ) + addAllAlternateAllelesToMap(vc, refAllele, map); + } + + return new AlleleMapper(map); + } + + /** + * Adds all of the alternate alleles from the VariantContext to the allele mapping (for use in creating the AlleleMapper) + * + * @param vc the VariantContext + * @param refAllele the reference allele + * @param map the allele mapping to populate + */ + static private void addAllAlternateAllelesToMap(final VariantContext vc, final Allele refAllele, final Map map) { + // if the ref allele matches, then just add the alts as is + if ( refAllele.equals(vc.getReference()) ) { + for ( final Allele altAllele : vc.getAlternateAlleles() ) { + // ignore symbolic alleles + if ( ! altAllele.isSymbolic() ) + map.put(altAllele, altAllele); + } + } + else { + map.putAll(createAlleleMapping(refAllele, vc, map.values())); + } + } + + static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set allAlleles) { if ( refAllele.equals(vc.getReference()) ) return new AlleleMapper(vc); else { - // we really need to do some work. The refAllele is the longest reference allele seen at this - // start site. So imagine it is: - // - // refAllele: ACGTGA - // myRef: ACGT - // myAlt: A - // - // We need to remap all of the alleles in vc to include the extra GA so that - // myRef => refAllele and myAlt => AGA - // - - Allele myRef = vc.getReference(); - if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); - byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); - -// System.out.printf("Remapping allele at %s%n", vc); -// System.out.printf("ref %s%n", refAllele); -// System.out.printf("myref %s%n", myRef ); -// System.out.printf("extrabases %s%n", new String(extraBases)); - - Map map = new HashMap<>(); - for ( final Allele a : vc.getAlleles() ) { - if ( a.isReference() ) - map.put(a, refAllele); - else { - Allele extended = Allele.extend(a, extraBases); - for ( final Allele b : allAlleles ) - if ( extended.equals(b) ) - extended = b; -// System.out.printf(" Extending %s => %s%n", a, extended); - map.put(a, extended); - } - } - - // debugging -// System.out.printf("mapping %s%n", map); - + final Map map = createAlleleMapping(refAllele, vc, allAlleles); + map.put(vc.getReference(), refAllele); return new AlleleMapper(map); } } + /** + * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele + * + * The refAllele is the longest reference allele seen at this start site. + * So imagine it is: + * refAllele: ACGTGA + * myRef: ACGT + * myAlt: A + * + * We need to remap all of the alleles in vc to include the extra GA so that + * myRef => refAllele and myAlt => AGA + * + * @param refAllele the new (extended) reference allele + * @param oneVC the Variant Context to extend + * @param currentAlleles the list of alleles already created + * @return a non-null mapping of original alleles to new (extended) ones + */ + private static Map createAlleleMapping(final Allele refAllele, + final VariantContext oneVC, + final Collection currentAlleles) { + final Allele myRef = oneVC.getReference(); + if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele); + + final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length()); + + final Map map = new HashMap<>(); + for ( final Allele a : oneVC.getAlternateAlleles() ) { + if ( isUsableAlternateAllele(a) ) { + Allele extended = Allele.extend(a, extraBases); + for ( final Allele b : currentAlleles ) + if ( extended.equals(b) ) + extended = b; + map.put(a, extended); + } + } + + return map; + } + + static private boolean isUsableAlternateAllele(final Allele allele) { + return ! (allele.isReference() || allele.isSymbolic() ); + } + public static List sortVariantContextsByPriority(Collection unsortedVCs, List priorityListOfVCs, GenotypeMergeType mergeOption ) { if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null ) throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list"); @@ -1352,6 +1510,124 @@ public class GATKVariantContextUtils { } } + /** + * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele + * + * @param alleles list of alleles to replace + * @return non-null list of alleles + */ + private static List replaceWithNoCalls(final List alleles) { + if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null"); + + final List result = new ArrayList<>(alleles.size()); + for ( final Allele allele : alleles ) + result.add(allele.equals(NON_REF_SYMBOLIC_ALLELE) ? allele : Allele.NO_CALL); + return result; + } + + /** + * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles. + * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set. + * + * @param mergedGenotypes the genotypes context to add to + * @param VC the Variant Context for the sample + * @param remappedAlleles the list of remapped alleles for the sample + * @param targetAlleles the list of target alleles + */ + private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes, + final VariantContext VC, + final List remappedAlleles, + final List targetAlleles) { + for ( final Genotype g : VC.getGenotypes() ) { + + // only add if the name is new + final String name = g.getSampleName(); + if ( !mergedGenotypes.containsSample(name) ) { + // we need to modify it even if it already contains all of the alleles because we need to purge the PLs out anyways + final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles); + final int[] PLs = generatePLs(g, indexesOfRelevantAlleles); + // note that we set the alleles to null here (as we expect it to be re-genotyped) + final Genotype newG = new GenotypeBuilder(g).name(name).alleles(null).PL(PLs).noAD().noGQ().make(); + mergedGenotypes.add(newG); + } + } + } + + /** + * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "" as appropriate. + * If the myAlleles set does not contain "" as an allele, it throws an exception. + * + * @param remappedAlleles the list of alleles to evaluate + * @param targetAlleles the target list of alleles + * @return non-null array of ints representing indexes + */ + protected static int[] getIndexesOfRelevantAlleles(final List remappedAlleles, final List targetAlleles) { + + if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty"); + if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty"); + + if ( !remappedAlleles.contains(NON_REF_SYMBOLIC_ALLELE) ) + throw new IllegalArgumentException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele; please use the Haplotype Caller with gVCF output to generate appropriate records"); + final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE); + + final int[] indexMapping = new int[targetAlleles.size()]; + + // the reference alleles always match up (even if they don't appear to) + indexMapping[0] = 0; + + // create the index mapping, using the allele whenever such a mapping doesn't exist + for ( int i = 1; i < targetAlleles.size(); i++ ) { + final int indexOfRemappedAllele = remappedAlleles.indexOf(targetAlleles.get(i)); + indexMapping[i] = indexOfRemappedAllele == -1 ? indexOfGenericAlt: indexOfRemappedAllele; + } + + return indexMapping; + } + + /** + * Generates new PLs given the set of indexes of the Genotype's current alleles from the original PLs. + * Throws an exception if the Genotype does not contain PLs. + * + * @param genotype the genotype from which to grab PLs + * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles + * @return non-null array of new PLs + */ + protected static int[] generatePLs(final Genotype genotype, final int[] indexesOfRelevantAlleles) { + if ( !genotype.hasPL() ) + throw new IllegalArgumentException("Cannot generate new PLs from a genotype without PLs"); + + final int[] originalPLs = genotype.getPL(); + + // assume diploid + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(indexesOfRelevantAlleles.length, 2); + final int[] newPLs = new int[numLikelihoods]; + + for ( int i = 0; i < indexesOfRelevantAlleles.length; i++ ) { + for ( int j = i; j < indexesOfRelevantAlleles.length; j++ ) { + final int originalPLindex = calculatePLindexFromUnorderedIndexes(indexesOfRelevantAlleles[i], indexesOfRelevantAlleles[j]); + if ( originalPLindex >= originalPLs.length ) + throw new IllegalStateException("The original PLs do not have enough values; accessing index " + originalPLindex + " but size is " + originalPLs.length); + + final int newPLindex = GenotypeLikelihoods.calculatePLindex(i, j); + newPLs[newPLindex] = originalPLs[originalPLindex]; + } + } + + return newPLs; + } + + /** + * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex() + * + * @param originalIndex1 the index of the first allele + * @param originalIndex2 the index of the second allele + * @return the PL index + */ + protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) { + // we need to make sure they are ordered correctly + return ( originalIndex2 < originalIndex1 ) ? GenotypeLikelihoods.calculatePLindex(originalIndex2, originalIndex1) : GenotypeLikelihoods.calculatePLindex(originalIndex1, originalIndex2); + } + public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) { return uniquify ? sampleName + "." + trackName : sampleName; } @@ -1717,6 +1993,15 @@ public class GATKVariantContextUtils { } return newAs; } + + /** + * @return the list of unique values + */ + public List getUniqueMappedAlleles() { + if ( map == null ) + return Collections.emptyList(); + return new ArrayList<>(new HashSet<>(map.values())); + } } private static class CompareByPriority implements Comparator, Serializable { diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java index 30f112241..23a24e180 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/GATKVariantContextUtilsUnitTest.java @@ -27,12 +27,9 @@ package org.broadinstitute.sting.utils.variant; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; -import org.broadinstitute.variant.vcf.VCFConstants; import org.testng.Assert; import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; @@ -182,7 +179,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false); Assert.assertEquals(merged.getAlleles(), cfg.expected); } @@ -240,7 +237,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( inputs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false); + GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false); Assert.assertEquals(merged.getID(), cfg.expected); } @@ -355,7 +352,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeFiltered(MergeFilteredTest cfg) { final List priority = vcs2priority(cfg.inputs); final VariantContext merged = GATKVariantContextUtils.simpleMerge( - cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); + cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -482,7 +479,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { public void testMergeGenotypes(MergeGenotypesTest cfg) { final VariantContext merged = GATKVariantContextUtils.simpleMerge( cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false); // test alleles are equal Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles()); @@ -523,7 +520,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false); + GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false); // test genotypes Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2"))); @@ -556,7 +553,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { final VariantContext merged = GATKVariantContextUtils.simpleMerge( Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false, false); + GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false); if ( annotate ) Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION); @@ -1036,26 +1033,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { } } - @Test(enabled = !DEBUG) - public void testFillInNonRefSymbolicAlleles() { - final int start = 10; - final String ref = "A"; - final String alt = "C"; - final VariantContext vcAlt = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, alt)); - final VariantContext vcRef = GATKVariantContextUtils.makeFromAlleles("test", "20", start, Arrays.asList(ref, "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">")); - - List VCs = Arrays.asList(vcAlt, vcRef); - VCs = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(VCs, Collections.emptyList()); - - // make sure the non ref symbolic alleles have all been filled in with the appropriate alternate allele - for( final VariantContext vc : VCs ) { - Assert.assertTrue(vc.getAlternateAlleles().size() == 1); - Assert.assertTrue(vc.getAlternateAllele(0).isNonReference()); - Assert.assertTrue(!vc.getReference().isSymbolic()); - Assert.assertTrue(!vc.getAlternateAllele(0).isSymbolic()); - } - } - // -------------------------------------------------------------------------------- // // test allele remapping @@ -1428,4 +1405,225 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest { assertGenotypesAreEqual(actualGT, expected); } } + + // -------------------------------------------------------------------------------- + // + // Test methods for merging reference confidence VCs + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "generatePLsData") + public Object[][] makeGeneratePLsData() { + final List tests = new ArrayList<>(); + + for ( int originalAlleles = 2; originalAlleles <= 5; originalAlleles++ ) { + for ( int swapPosition1 = 0; swapPosition1 < originalAlleles; swapPosition1++ ) { + for ( int swapPosition2 = swapPosition1+1; swapPosition2 < originalAlleles; swapPosition2++ ) { + final int[] indexes = new int[originalAlleles]; + for ( int i = 0; i < originalAlleles; i++ ) + indexes[i] = i; + indexes[swapPosition1] = swapPosition2; + indexes[swapPosition2] = swapPosition1; + tests.add(new Object[]{originalAlleles, indexes}); + } + } + } + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "generatePLsData") + public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) { + + final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2); + final int[] PLs = new int[numLikelihoods]; + for ( int i = 0; i < numLikelihoods; i++ ) + PLs[i] = i; + + final List alleles = new ArrayList<>(numOriginalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < numOriginalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make(); + + final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering); + + Assert.assertEquals(newPLs.length, numLikelihoods); + + final int[] expectedPLs = new int[numLikelihoods]; + for ( int i = 0; i < numOriginalAlleles; i++ ) { + for ( int j = i; j < numOriginalAlleles; j++ ) { + final int index = GenotypeLikelihoods.calculatePLindex(i, j); + final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]); + expectedPLs[index] = value; + } + } + + for ( int i = 0; i < numLikelihoods; i++ ) { + Assert.assertEquals(newPLs[i], expectedPLs[i]); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testGetIndexesOfRelevantAllelesWithNoALT() { + + final List alleles1 = new ArrayList<>(1); + alleles1.add(Allele.create("A", true)); + final List alleles2 = new ArrayList<>(1); + alleles2.add(Allele.create("A", true)); + GATKVariantContextUtils.getIndexesOfRelevantAlleles(alleles1, alleles2); + Assert.fail("We should have thrown an exception because the allele was not present"); + } + + @DataProvider(name = "getIndexesOfRelevantAllelesData") + public Object[][] makeGetIndexesOfRelevantAllelesData() { + final int totalAlleles = 5; + final List alleles = new ArrayList<>(totalAlleles); + alleles.add(Allele.create("A", true)); + for ( int i = 1; i < totalAlleles; i++ ) + alleles.add(Allele.create(Utils.dupString('A', i + 1), false)); + + final List tests = new ArrayList<>(); + + for ( int alleleIndex = 0; alleleIndex < totalAlleles; alleleIndex++ ) { + tests.add(new Object[]{alleleIndex, alleles}); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "getIndexesOfRelevantAllelesData") + public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List allAlleles) { + final List myAlleles = new ArrayList<>(3); + + // always add the reference and alleles + myAlleles.add(allAlleles.get(0)); + myAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + // optionally add another alternate allele + if ( allelesIndex > 0 ) + myAlleles.add(allAlleles.get(allelesIndex)); + + final int[] indexes = GATKVariantContextUtils.getIndexesOfRelevantAlleles(myAlleles, allAlleles); + + Assert.assertEquals(indexes.length, allAlleles.size()); + + for ( int i = 0; i < allAlleles.size(); i++ ) { + if ( i == 0 ) + Assert.assertEquals(indexes[i], 0); // ref should always match + else if ( i == allelesIndex ) + Assert.assertEquals(indexes[i], 2); // allele + else + Assert.assertEquals(indexes[i], 1); // + } + } + + @DataProvider(name = "referenceConfidenceMergeData") + public Object[][] makeReferenceConfidenceMergeData() { + final List tests = new ArrayList<>(); + + final int start = 10; + final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start); + final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make(); + final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make(); + + final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73}; + final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10}; + + final List A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).make(); + final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make(); + final Allele AAref = Allele.create("AA", true); + final List AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).make(); + final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make(); + final List A_C = Arrays.asList(Aref, C); + final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).make(); + final List A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).make(); + final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make(); + final List A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).make(); + final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make(); + final List A_C_G = Arrays.asList(Aref, C, G); + final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).make(); + final List A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).make(); + final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make(); + final List A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).make(); + final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make(); + final Allele A = Allele.create("A", false); + final List AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE); + final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).make(); + final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make(); + + // first test the case of a single record + tests.add(new Object[]{Arrays.asList(vcA_C_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()}); + + // now, test pairs: + // a SNP with another SNP + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).make()).make()}); + // a SNP with an indel + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).make()).make()}); + // a SNP with 2 SNPs + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()}); + // a SNP with a ref record + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()}); + + // spanning records: + // a SNP with a spanning ref record + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()}); + // a SNP with a spanning deletion + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).make()).make()}); + + // combination of all + tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), + loc, false, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC, G)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).make(), + new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).make(), + new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).make(), + new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 71, 72, 74, 20, 10, 73, 30}).make(), + new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).make(), + new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).make(), + new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).make()).make()}); + + // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts + tests.add(new Object[]{Arrays.asList(vcAA_ALT), + loc, false, + null}); + tests.add(new Object[]{Arrays.asList(vcAA_ALT), + loc, true, + new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).make()).make()}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "referenceConfidenceMergeData") + public void testReferenceConfidenceMerge(final List toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) { + final VariantContext result = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, returnSiteEvenIfMonomorphic ? (byte)'A' : null); + if ( result == null ) { + Assert.assertTrue(expectedResult == null); + return; + } + Assert.assertEquals(result.getAlleles(), expectedResult.getAlleles()); + Assert.assertEquals(result.getNSamples(), expectedResult.getNSamples()); + for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) { + Assert.assertTrue(result.hasGenotype(expectedGenotype.getSampleName()), "Missing " + expectedGenotype.getSampleName()); + // use string comparisons to test equality for now + Assert.assertEquals(result.getGenotype(expectedGenotype.getSampleName()).toString(), expectedGenotype.toString()); + } + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java index a1b75a3f1..bb794dca4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variant/VariantContextBenchmark.java @@ -234,7 +234,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { GATKVariantContextUtils.simpleMerge(toMerge, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, - true, false, "set", false, true, false); + true, false, "set", false, true); } };