Merge pull request #456 from broadinstitute/eb_unify_hc_combination_steps

Created a new walker to do the full combination of N gVCFs from the HC single-sample ref calc pipeline.
This commit is contained in:
Eric Banks 2013-12-31 18:57:27 -08:00
commit 5a1564d1f2
10 changed files with 998 additions and 206 deletions

View File

@ -286,7 +286,7 @@ public class ConsensusAlleleCounter {
if (vcs.isEmpty())
return Collections.emptyList(); // nothing else to do, no alleles passed minimum count criterion
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false, false);
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(vcs, null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, null, false, false);
return mergedVC.getAlleles();
}
}

View File

@ -183,7 +183,7 @@ public class GenotypingEngine {
final List<String> priorityList = makePriorityList(eventsAtThisLoc);
// Merge the event to find a common reference representation
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false, false);
final VariantContext mergedVC = GATKVariantContextUtils.simpleMerge(eventsAtThisLoc, priorityList, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; }
if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) {

View File

@ -0,0 +1,228 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Reference;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
import org.broadinstitute.sting.gatk.walkers.Window;
import org.broadinstitute.sting.gatk.walkers.annotator.ChromosomeCountConstants;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.HelpConstants;
import org.broadinstitute.sting.utils.variant.GATKVCFUtils;
import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.variant.variantcontext.VariantContext;
import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.variant.vcf.*;
import java.util.*;
/**
* Combines gVCF records that were produced by the Haplotype Caller from single sample sources.
*
* <p>
* CombineReferenceCalculationVariants combines gVCF records that were produced as part of the "single sample discovery"
* pipeline using the '-ERC GVCF' mode of the Haplotype Caller. This tool performs the multi-sample joint aggregation
* step and merges the records together in a sophisticated manner.
*
* At all positions of the target, this tool will combine all spanning records, produce correct genotype likelihoods,
* re-genotype the newly merged record, and then re-annotate it.
*
*
* <h3>Input</h3>
* <p>
* One or more Haplotype Caller gVCFs to combine.
* </p>
*
* <h3>Output</h3>
* <p>
* A combined VCF.
* </p>
*
* <h3>Examples</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineReferenceCalculationVariants \
* --variant input1.vcf \
* --variant input2.vcf \
* -o output.vcf
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-10,stop=10))
public class CombineReferenceCalculationVariants extends RodWalker<VariantContext, VariantContextWriter> implements AnnotatorCompatible, TreeReducible<VariantContextWriter> {
// TODO -- allow a file of VCF paths to be entered?
/**
* The VCF files to merge together
*/
@Input(fullName="variant", shortName = "V", doc="One or more input VCF files", required=true)
public List<RodBinding<VariantContext>> variants;
// Destination for the combined, re-genotyped, re-annotated records.
@Output(doc="File to which variants should be written")
protected VariantContextWriter vcfWriter = null;
// When true, loci that turn out monomorphic after combining are still emitted (see regenotypeVC and map).
@Argument(fullName="includeNonVariants", shortName="inv", doc="Include loci found to be non-variant after the combining procedure", required=false)
public boolean INCLUDE_NON_VARIANTS = false;
/**
* Which annotations to recompute for the combined output VCF file.
*/
@Advanced
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to recompute", required=false)
protected List<String> annotationsToUse = new ArrayList<>(Arrays.asList(new String[]{"InbreedingCoeff", "FisherStrand", "QualByDepth", "ChromosomeCounts"}));
/**
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP is not used in any way for the calculations themselves.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
// the genotyping engine
private UnifiedGenotyperEngine genotypingEngine;
// the annotation engine
private VariantAnnotatorEngine annotationEngine;
// AnnotatorCompatible interface: this walker supplies no comp, snpEff, or resource bindings to the annotation engine.
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
public boolean alwaysAppendDbsnpId() { return false; }
/**
* Merges the headers of all input VCFs, writes the combined header to the output,
* and constructs the genotyping and annotation engines used during traversal.
* Sample names are required to be unique across inputs (REQUIRE_UNIQUE).
*/
public void initialize() {
// take care of the VCF headers
final Map<String, VCFHeader> vcfRods = GATKVCFUtils.getVCFHeadersFromRods(getToolkit());
final Set<String> samples = SampleUtils.getSampleList(vcfRods, GATKVariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
final Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), true);
// make sure the AC/AF/AN header lines are present since ChromosomeCounts is recomputed by default
headerLines.addAll(Arrays.asList(ChromosomeCountConstants.descriptions));
final VCFHeader vcfHeader = new VCFHeader(headerLines, samples);
vcfWriter.writeHeader(vcfHeader);
// create the genotyping engine
// GENOTYPE_GIVEN_ALLELES: re-genotype at the alleles produced by the merge; EMIT_ALL_SITES so non-variant loci can be kept when requested
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH;
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
genotypingEngine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, GATKVariantContextUtils.DEFAULT_PLOIDY);
// create the annotation engine
annotationEngine = new VariantAnnotatorEngine(Arrays.asList("none"), annotationsToUse, Collections.<String>emptyList(), this, getToolkit());
}
/**
* Combines all gVCF records spanning the current locus and re-genotypes the result.
*
* @param tracker the ROD tracker (may be null on framework-internal map calls; null is returned in that case)
* @param ref     the reference context at this locus
* @param context the alignment context (unused by this walker)
* @return the combined, re-genotyped VariantContext, or null if there is nothing to emit here
*/
public VariantContext map(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return null;
final GenomeLoc loc = ref.getLocus();
// merge all spanning records; the ref base is passed only when non-variant sites should be emitted
final VariantContext combinedVC = GATKVariantContextUtils.referenceConfidenceMerge(tracker.getPrioritizedValue(variants, loc), loc, INCLUDE_NON_VARIANTS ? ref.getBase() : null);
if ( combinedVC == null )
return null;
return regenotypeVC(tracker, ref, combinedVC);
}
/**
* Re-genotype (and re-annotate) a combined genomic VC
*
* @param tracker the ref tracker
* @param ref the ref context
* @param combinedVC the combined genomic VC
* @return a new VariantContext or null if the site turned monomorphic and we don't want such sites
* @throws IllegalArgumentException if combinedVC is null
*/
protected VariantContext regenotypeVC(final RefMetaDataTracker tracker, final ReferenceContext ref, final VariantContext combinedVC) {
if ( combinedVC == null ) throw new IllegalArgumentException("combinedVC cannot be null");
VariantContext result = combinedVC;
// only re-genotype polymorphic sites
if ( combinedVC.isVariant() )
result = genotypingEngine.calculateGenotypes(result);
// if it turned monomorphic and we don't want such sites, quit
if ( !INCLUDE_NON_VARIANTS && result.isMonomorphicInSamples() )
return null;
// re-annotate it
return annotationEngine.annotateContext(tracker, ref, null, result);
}
// The reduce value is simply the shared output writer.
public VariantContextWriter reduceInit() {
return vcfWriter;
}
// Write each non-null record produced by map() to the output.
public VariantContextWriter reduce(final VariantContext vc, final VariantContextWriter writer) {
if ( vc != null )
writer.add(vc);
return writer;
}
@Override
public VariantContextWriter treeReduce(final VariantContextWriter lhs, final VariantContextWriter rhs) {
// both halves hold the same single output writer, so returning either side is equivalent
return lhs;
}
@Override
public void onTraversalDone(final VariantContextWriter writer) {} // no per-walker cleanup needed
}

View File

@ -0,0 +1,72 @@
/*
* By downloading the PROGRAM you agree to the following terms of use:
*
* BROAD INSTITUTE - SOFTWARE LICENSE AGREEMENT - FOR ACADEMIC NON-COMMERCIAL RESEARCH PURPOSES ONLY
*
* This Agreement is made between the Broad Institute, Inc. with a principal address at 7 Cambridge Center, Cambridge, MA 02142 (BROAD) and the LICENSEE and is effective at the date the downloading is completed (EFFECTIVE DATE).
*
* WHEREAS, LICENSEE desires to license the PROGRAM, as defined hereinafter, and BROAD wishes to have this PROGRAM utilized in the public interest, subject only to the royalty-free, nonexclusive, nontransferable license rights of the United States Government pursuant to 48 CFR 52.227-14; and
* WHEREAS, LICENSEE desires to license the PROGRAM and BROAD desires to grant a license on the following terms and conditions.
* NOW, THEREFORE, in consideration of the promises and covenants made herein, the parties hereto agree as follows:
*
* 1. DEFINITIONS
* 1.1 PROGRAM shall mean copyright in the object code and source code known as GATK2 and related documentation, if any, as they exist on the EFFECTIVE DATE and can be downloaded from http://www.broadinstitute/GATK on the EFFECTIVE DATE.
*
* 2. LICENSE
* 2.1 Grant. Subject to the terms of this Agreement, BROAD hereby grants to LICENSEE, solely for academic non-commercial research purposes, a non-exclusive, non-transferable license to: (a) download, execute and display the PROGRAM and (b) create bug fixes and modify the PROGRAM.
* The LICENSEE may apply the PROGRAM in a pipeline to data owned by users other than the LICENSEE and provide these users the results of the PROGRAM provided LICENSEE does so for academic non-commercial purposes only. For clarification purposes, academic sponsored research is not a commercial use under the terms of this Agreement.
* 2.2 No Sublicensing or Additional Rights. LICENSEE shall not sublicense or distribute the PROGRAM, in whole or in part, without prior written permission from BROAD. LICENSEE shall ensure that all of its users agree to the terms of this Agreement. LICENSEE further agrees that it shall not put the PROGRAM on a network, server, or other similar technology that may be accessed by anyone other than the LICENSEE and its employees and users who have agreed to the terms of this agreement.
* 2.3 License Limitations. Nothing in this Agreement shall be construed to confer any rights upon LICENSEE by implication, estoppel, or otherwise to any computer software, trademark, intellectual property, or patent rights of BROAD, or of any other entity, except as expressly granted herein. LICENSEE agrees that the PROGRAM, in whole or part, shall not be used for any commercial purpose, including without limitation, as the basis of a commercial software or hardware product or to provide services. LICENSEE further agrees that the PROGRAM shall not be copied or otherwise adapted in order to circumvent the need for obtaining a license for use of the PROGRAM.
*
* 3. OWNERSHIP OF INTELLECTUAL PROPERTY
* LICENSEE acknowledges that title to the PROGRAM shall remain with BROAD. The PROGRAM is marked with the following BROAD copyright notice and notice of attribution to contributors. LICENSEE shall retain such notice on all copies. LICENSEE agrees to include appropriate attribution if any results obtained from use of the PROGRAM are included in any publication.
* Copyright 2012 Broad Institute, Inc.
* Notice of attribution: The GATK2 program was made available through the generosity of Medical and Population Genetics program at the Broad Institute, Inc.
* LICENSEE shall not use any trademark or trade name of BROAD, or any variation, adaptation, or abbreviation, of such marks or trade names, or any names of officers, faculty, students, employees, or agents of BROAD except as states above for attribution purposes.
*
* 4. INDEMNIFICATION
* LICENSEE shall indemnify, defend, and hold harmless BROAD, and their respective officers, faculty, students, employees, associated investigators and agents, and their respective successors, heirs and assigns, (Indemnitees), against any liability, damage, loss, or expense (including reasonable attorneys fees and expenses) incurred by or imposed upon any of the Indemnitees in connection with any claims, suits, actions, demands or judgments arising out of any theory of liability (including, without limitation, actions in the form of tort, warranty, or strict liability and regardless of whether such action has any factual basis) pursuant to any right or license granted under this Agreement.
*
* 5. NO REPRESENTATIONS OR WARRANTIES
* THE PROGRAM IS DELIVERED AS IS. BROAD MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE PROGRAM OR THE COPYRIGHT, EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, WHETHER OR NOT DISCOVERABLE. BROAD EXTENDS NO WARRANTIES OF ANY KIND AS TO PROGRAM CONFORMITY WITH WHATEVER USER MANUALS OR OTHER LITERATURE MAY BE ISSUED FROM TIME TO TIME.
* IN NO EVENT SHALL BROAD OR ITS RESPECTIVE DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATED INVESTIGATORS AND AFFILIATES BE LIABLE FOR INCIDENTAL OR CONSEQUENTIAL DAMAGES OF ANY KIND, INCLUDING, WITHOUT LIMITATION, ECONOMIC DAMAGES OR INJURY TO PROPERTY AND LOST PROFITS, REGARDLESS OF WHETHER BROAD SHALL BE ADVISED, SHALL HAVE OTHER REASON TO KNOW, OR IN FACT SHALL KNOW OF THE POSSIBILITY OF THE FOREGOING.
*
* 6. ASSIGNMENT
* This Agreement is personal to LICENSEE and any rights or obligations assigned by LICENSEE without the prior written consent of BROAD shall be null and void.
*
* 7. MISCELLANEOUS
* 7.1 Export Control. LICENSEE gives assurance that it will comply with all United States export control laws and regulations controlling the export of the PROGRAM, including, without limitation, all Export Administration Regulations of the United States Department of Commerce. Among other things, these laws and regulations prohibit, or require a license for, the export of certain types of software to specified countries.
* 7.2 Termination. LICENSEE shall have the right to terminate this Agreement for any reason upon prior written notice to BROAD. If LICENSEE breaches any provision hereunder, and fails to cure such breach within thirty (30) days, BROAD may terminate this Agreement immediately. Upon termination, LICENSEE shall provide BROAD with written assurance that the original and all copies of the PROGRAM have been destroyed, except that, upon prior written authorization from BROAD, LICENSEE may retain a copy for archive purposes.
* 7.3 Survival. The following provisions shall survive the expiration or termination of this Agreement: Articles 1, 3, 4, 5 and Sections 2.2, 2.3, 7.3, and 7.4.
* 7.4 Notice. Any notices under this Agreement shall be in writing, shall specifically refer to this Agreement, and shall be sent by hand, recognized national overnight courier, confirmed facsimile transmission, confirmed electronic mail, or registered or certified mail, postage prepaid, return receipt requested. All notices under this Agreement shall be deemed effective upon receipt.
* 7.5 Amendment and Waiver; Entire Agreement. This Agreement may be amended, supplemented, or otherwise modified only by means of a written instrument signed by all parties. Any waiver of any rights or failure to act in a specific instance shall relate only to such instance and shall not be construed as an agreement to waive any rights or fail to act in any other instance, whether or not similar. This Agreement constitutes the entire agreement among the parties with respect to its subject matter and supersedes prior agreements or understandings between the parties relating to its subject matter.
* 7.6 Binding Effect; Headings. This Agreement shall be binding upon and inure to the benefit of the parties and their respective permitted successors and assigns. All headings are for convenience only and shall not affect the meaning of any provision of this Agreement.
* 7.7 Governing Law. This Agreement shall be construed, governed, interpreted and applied in accordance with the internal laws of the Commonwealth of Massachusetts, U.S.A., without regard to conflict of laws principles.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays;
public class CombineReferenceCalculationVariantsIntegrationTest extends WalkerTest {
// Shared command-line prefix: tool selection and boilerplate, followed by the reference and the test-specific args.
private static String baseTestString(String args, String ref) {
return "-T CombineReferenceCalculationVariants --no_cmdline_in_header -L 1:1-50,000,000 -o %s -R " + ref + args;
}
// TODO -- enable this test (and create others) once the Haplotype Caller produces appropriate gVCFs (with <NON-REF> for every record)
@Test(enabled = false)
public void combineSingleSamplePipelineGVCF() {
final String inputArgs = " -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -L 20:10,000,000-10,001,000";
final WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(inputArgs, b37KGReference),
1,
Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a"));
executeTest("combineSingleSamplePipelineGVCF", spec);
}
}

View File

@ -185,19 +185,6 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
@Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); }
@Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "46bbbbb8fc9ae6467a4f8fe35b8d7d14"); }
@Test public void combineSingleSamplePipelineGVCF() {
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString(" -V:sample1 " + privateTestDir + "combine.single.sample.pipeline.1.vcf" +
" -V:sample2 " + privateTestDir + "combine.single.sample.pipeline.2.vcf" +
" -V:sample3 " + privateTestDir + "combine.single.sample.pipeline.3.vcf" +
" -multipleAllelesMergeType MIX_TYPES" +
" --excludeNonVariants -combineAnnotations -setKey null" +
" -L 20:10,000,000-10,001,000", b37KGReference),
1,
Arrays.asList("0413f0725fc5ec3a4f1ee246f6cb3a2a"));
cvExecuteTest("combineSingleSamplePipelineGVCF", spec, true);
}
@Test
public void combineDBSNPDuplicateSites() {
WalkerTestSpec spec = new WalkerTestSpec(

View File

@ -170,7 +170,36 @@ public class RefMetaDataTracker {
@Requires({"type != null", "onlyAtThisLoc != null"})
public <T extends Feature> T getFirstValue(final Class<T> type, final GenomeLoc onlyAtThisLoc) {
// Convenience delegate: fetch all Features of this type at the locus and take the first
// (presumably null when none are bound — confirm against safeGetFirst's contract).
return safeGetFirst(getValues(type, onlyAtThisLoc));
}
/**
* Same logic as {@link #getFirstValue(RodBinding, GenomeLoc)} but, per binding, prefers a record
* starting at prioritizeThisLoc when one is available, falling back to any record from that track.
*
* @param rodBindings       only Features coming from the tracks associated with one of rodBindings are fetched
* @param <T>               the Tribble Feature type of the rodBinding, and consequently the type of the resulting list of Features
* @param prioritizeThisLoc Features starting at this site are preferred
* @return a freshly allocated list of the chosen bindings, or an empty list if none are bound
*/
@Requires({"rodBindings != null", "prioritizeThisLoc != null"})
@Ensures("result != null")
public <T extends Feature> List<T> getPrioritizedValue(final Collection<RodBinding<T>> rodBindings, final GenomeLoc prioritizeThisLoc) {
final List<T> chosenValues = new ArrayList<>();
for ( final RodBinding<T> binding : rodBindings ) {
// prefer a record at the prioritized location; otherwise fall back to any record from this track
final T atPriorityLoc = getFirstValue(binding, prioritizeThisLoc);
final T chosen = (atPriorityLoc != null) ? atPriorityLoc : getFirstValue(binding);
if ( chosen != null )
chosenValues.add(chosen);
}
return chosenValues;
}
/**

View File

@ -191,9 +191,6 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
@Argument(fullName="mergeInfoWithMaxAC", shortName="mergeInfoWithMaxAC", doc="If true, when VCF records overlap the info field is taken from the one with the max AC instead of only taking the fields which are identical across the overlapping records.", required=false)
public boolean MERGE_INFO_WITH_MAX_AC = false;
@Argument(fullName="combineAnnotations", shortName="combineAnnotations", doc="If true, combine the annotation values in some straightforward manner assuming the input callsets are i.i.d.", required=false)
public boolean COMBINE_ANNOTATIONS = false;
private List<String> priority = null;
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
@ -260,12 +257,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
// get all of the vcf rods at this locus
// Need to provide reference bases to simpleMerge starting at current locus
Collection<VariantContext> vcs = tracker.getValues(variants, context.getLocation());
Collection<VariantContext> potentialRefVCs = tracker.getValues(variants);
potentialRefVCs.removeAll(vcs);
if ( sitesOnlyVCF ) {
vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
potentialRefVCs = VariantContextUtils.sitesOnlyVariantContexts(potentialRefVCs);
}
if ( ASSUME_IDENTICAL_SAMPLES ) {
@ -307,17 +301,16 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
// make sure that it is a variant or in case it is not, that we want to include the sites with no variants
if (!EXCLUDE_NON_VARIANTS || !type.equals(VariantContext.Type.NO_VARIATION)) {
if (VCsByType.containsKey(type)) {
mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), potentialRefVCs,
priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS));
mergedVCs.add(GATKVariantContextUtils.simpleMerge(VCsByType.get(type), priority, rodNames.size(),
filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
}
}
}
else if (multipleAllelesMergeType == GATKVariantContextUtils.MultipleAllelesMergeType.MIX_TYPES) {
mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, potentialRefVCs,
priority, rodNames.size(), filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges,
SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC, COMBINE_ANNOTATIONS));
mergedVCs.add(GATKVariantContextUtils.simpleMerge(vcs, priority, rodNames.size(), filteredRecordsMergeType,
genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC));
}
else {
logger.warn("Ignoring all records at site " + ref.getLocus());

View File

@ -790,7 +790,6 @@ public class GATKVariantContextUtils {
* @param setKey the key name of the set
* @param filteredAreUncalled are filtered records uncalled?
* @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
* @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d.
* @return new VariantContext representing the merge of unsortedVCs
*/
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
@ -801,10 +800,9 @@ public class GATKVariantContextUtils {
final boolean printMessages,
final String setKey,
final boolean filteredAreUncalled,
final boolean mergeInfoWithMaxAC,
final boolean combineAnnotations ) {
final boolean mergeInfoWithMaxAC ) {
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
return simpleMerge(unsortedVCs, Collections.<VariantContext>emptyList(), priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC, combineAnnotations);
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, setKey, filteredAreUncalled, mergeInfoWithMaxAC);
}
/**
@ -817,7 +815,6 @@ public class GATKVariantContextUtils {
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
*
* @param unsortedVCs collection of unsorted VCs
* @param potentialRefVCs collection of unsorted VCs that overlap this locus which should only be searched for potential reference records
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
* @param genotypeMergeOptions merge option for genotypes
@ -826,11 +823,9 @@ public class GATKVariantContextUtils {
* @param setKey the key name of the set
* @param filteredAreUncalled are filtered records uncalled?
* @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count?
* @param combineAnnotations should we merge info field annotations by assuming the incoming VCs are i.i.d.
* @return new VariantContext representing the merge of unsortedVCs
*/
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
final Collection<VariantContext> potentialRefVCs,
final List<String> priorityListOfVCs,
final int originalNumOfVCs,
final FilteredRecordMergeType filteredRecordMergeType,
@ -839,9 +834,7 @@ public class GATKVariantContextUtils {
final boolean printMessages,
final String setKey,
final boolean filteredAreUncalled,
final boolean mergeInfoWithMaxAC,
final boolean combineAnnotations ) {
final boolean mergeInfoWithMaxAC ) {
if ( unsortedVCs == null || unsortedVCs.size() == 0 )
return null;
@ -860,9 +853,6 @@ public class GATKVariantContextUtils {
VCs.add(vc);
}
// cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible
VCs = fillInNonRefSymbolicAlleles(VCs, potentialRefVCs);
if ( VCs.size() == 0 ) // everything is filtered out and we're filteredAreUncalled
return null;
@ -950,38 +940,19 @@ public class GATKVariantContextUtils {
for (final Map.Entry<String, Object> p : vc.getAttributes().entrySet()) {
final String key = p.getKey();
final Object value = p.getValue();
boolean badAnnotation = false;
if ( combineAnnotations ) { // add the annotation values to a list for combining later
List<Comparable> values = annotationMap.get(key);
if( values == null ) {
values = new ArrayList<>();
annotationMap.put(key, values);
}
try {
final String stringValue = value.toString();
// Branch to avoid unintentional, implicit type conversions that occur with the ? operator.
if (stringValue.contains("."))
values.add(Double.parseDouble(stringValue));
else
values.add(Integer.parseInt(stringValue));
} catch (NumberFormatException e) {
badAnnotation = true;
}
}
if ( ! combineAnnotations || badAnnotation ) { // only output annotations that have the same value in every input VC
// if we don't like the key already, don't go anywhere
if ( ! inconsistentAttributes.contains(key) ) {
final boolean alreadyFound = attributes.containsKey(key);
final Object boundValue = attributes.get(key);
final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
// only output annotations that have the same value in every input VC
// if we don't like the key already, don't go anywhere
if ( ! inconsistentAttributes.contains(key) ) {
final boolean alreadyFound = attributes.containsKey(key);
final Object boundValue = attributes.get(key);
final boolean boundIsMissingValue = alreadyFound && boundValue.equals(VCFConstants.MISSING_VALUE_v4);
if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) {
// we found the value but we're inconsistent, put it in the exclude list
inconsistentAttributes.add(key);
attributes.remove(key);
} else if ( ! alreadyFound || boundIsMissingValue ) { // no value
attributes.put(key, value);
}
if ( alreadyFound && ! boundValue.equals(value) && ! boundIsMissingValue ) {
// we found the value but we're inconsistent, put it in the exclude list
inconsistentAttributes.add(key);
attributes.remove(key);
} else if ( ! alreadyFound || boundIsMissingValue ) { // no value
attributes.put(key, value);
}
}
}
@ -1007,12 +978,6 @@ public class GATKVariantContextUtils {
// take the VC with the maxAC and pull the attributes into a modifiable map
if ( mergeInfoWithMaxAC && vcWithMaxAC != null ) {
attributesWithMaxAC.putAll(vcWithMaxAC.getAttributes());
} else if ( combineAnnotations ) { // when combining annotations use the median value from all input VCs which had annotations provided
for ( final Map.Entry<String, List<Comparable>> p : annotationMap.entrySet() ) {
if ( ! p.getValue().isEmpty() ) {
attributes.put(p.getKey(), combineAnnotationValues(p.getValue()));
}
}
}
// if at least one record was unfiltered and we want a union, clear all of the filters
@ -1058,11 +1023,6 @@ public class GATKVariantContextUtils {
builder.filters(filters.isEmpty() ? filters : new TreeSet<>(filters));
}
builder.attributes(new TreeMap<>(mergeInfoWithMaxAC ? attributesWithMaxAC : attributes));
if( combineAnnotations ) {
// unfortunately some attributes are just too dangerous to try to combine together
builder.rmAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY);
builder.rmAttribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
}
// Trim the padded bases of all alleles if necessary
final VariantContext merged = builder.make();
@ -1070,69 +1030,202 @@ public class GATKVariantContextUtils {
return merged;
}
private static final Comparable combineAnnotationValues( final List<Comparable> array ) {
/**
 * Combines the per-VC numeric values collected for one annotation into a single value.
 *
 * @param array the parsed numeric (Integer/Double) values of one annotation, gathered from each input VC
 * @return the combined value (currently the median of the inputs)
 */
private static Comparable combineAnnotationValues( final List<Comparable> array ) {
    return MathUtils.median(array); // right now we take the median but other options could be explored
}
/**
* cycle through and fill in NON_REF_SYMBOLIC_ALLELEs with the actual alternate allele if possible
* @param VCs the list of VCs in which to fill in symbolic alleles
* @param potentialRefVCs the list of VCs which are overlapping the current locus-- need to look for reference blocks and fill in with alternate alleles
* @return the list of VCs to merge in which all the NON_REF_SYMBOLIC_ALLELEs have been replaced with the correct alternate allele
* Merges VariantContexts from gVCFs into a single hybrid.
* Assumes that none of the input records are filtered.
*
* @param VCs collection of unsorted genomic VCs
* @param loc the current location
* @param refBase the reference allele to use if all contexts in the VC are spanning (i.e. don't start at the location in loc); if null, we'll return null in this case
* @return new VariantContext representing the merge of all VCs or null if it is not relevant
*/
protected static final List<VariantContext> fillInNonRefSymbolicAlleles( final List<VariantContext> VCs, final Collection<VariantContext> potentialRefVCs ) {
if( VCs == null ) { throw new IllegalArgumentException("VCs cannot be null"); }
if( potentialRefVCs == null ) { throw new IllegalArgumentException("potentialRefVCs cannot be null"); }
public static VariantContext referenceConfidenceMerge(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase) {
final List<VariantContext> VCsToReturn = new ArrayList<>(VCs.size());
boolean containsNonRefSymbolicAllele = false;
VariantContext nonRefVC = null;
for( final VariantContext vc : VCs ) {
if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) {
containsNonRefSymbolicAllele = true;
} else if ( nonRefVC == null ) {
nonRefVC = vc;
}
if( nonRefVC != null && containsNonRefSymbolicAllele == true ) {
break; // break out so that we don't run over the whole list unnecessarily
}
if ( VCs == null || VCs.size() == 0 ) throw new IllegalArgumentException("VCs cannot be null or empty");
// establish the baseline info (sometimes from the first VC)
final VariantContext first = VCs.get(0);
final String name = first.getSource();
// ref allele
final Allele refAllele = determineReferenceAlleleGiveReferenceBase(VCs, loc, refBase);
if ( refAllele == null )
return null;
// alt alleles
final AlleleMapper alleleMapper = determineAlternateAlleleMapping(VCs, refAllele, loc);
final List<Allele> alleles = getAllelesListFromMapper(refAllele, alleleMapper);
final Map<String, Object> attributes = new LinkedHashMap<>();
final Set<String> inconsistentAttributes = new HashSet<>();
final Set<String> rsIDs = new LinkedHashSet<>(1); // most of the time there's one id
VariantContext longestVC = first;
int depth = 0;
final Map<String, List<Comparable>> annotationMap = new LinkedHashMap<>();
GenotypesContext genotypes = GenotypesContext.create();
// cycle through and add info from the other VCs
for ( final VariantContext vc : VCs ) {
// if this context doesn't start at the current location then it must be a spanning event (deletion or ref block)
final boolean isSpanningEvent = loc.getStart() != vc.getStart();
final List<Allele> remappedAlleles = isSpanningEvent ? replaceWithNoCalls(vc.getAlleles()) : alleleMapper.remap(vc.getAlleles());
mergeRefConfidenceGenotypes(genotypes, vc, remappedAlleles, alleles);
// special case DP (add it up) for all events
if ( vc.hasAttribute(VCFConstants.DEPTH_KEY) )
depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
if ( isSpanningEvent )
continue;
// keep track of the longest location that starts here
if ( VariantContextUtils.getSize(vc) > VariantContextUtils.getSize(longestVC) )
longestVC = vc;
// special case ID (just preserve it)
if ( vc.hasID() ) rsIDs.add(vc.getID());
// add attributes
addReferenceConfidenceAttributes(vc.getAttributes(), attributes, inconsistentAttributes, annotationMap);
}
for( final VariantContext vc : potentialRefVCs ) {
if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) {
containsNonRefSymbolicAllele = true;
VCs.add(vc); // add the overlapping non-ref symbolic records to the VCs list in order to be filled in below
// when combining annotations use the median value from all input VCs which had annotations provided
for ( final Map.Entry<String, List<Comparable>> p : annotationMap.entrySet() ) {
if ( ! p.getValue().isEmpty() ) {
attributes.put(p.getKey(), combineAnnotationValues(p.getValue()));
}
}
if( !containsNonRefSymbolicAllele ) {
return VCs;
}
if ( depth > 0 )
attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth));
for( final VariantContext vc : VCs ) {
if( vc.hasAllele(NON_REF_SYMBOLIC_ALLELE) ) { // create a new record based on the current record but instead has the symbolic allele replaced by the alternate allele for this site
if( nonRefVC != null ) {
final GenotypesContext genotypes = GenotypesContext.create(vc.getSampleNames().size());
int depth = 0;
for( final String sample : vc.getSampleNames() ) {
final Genotype gt = vc.getGenotype(sample);
final ArrayList<Allele> refAlleles = new ArrayList<>(2);
refAlleles.add(nonRefVC.getReference());
refAlleles.add(nonRefVC.getReference());
final int[] pl = ( nonRefVC.isBiallelic() ? gt.getPL() : null ); // PLs only works for biallelic sites for now
depth += ( gt.hasDP() ? gt.getDP() : Integer.parseInt((String)gt.getAnyAttribute("MIN_DP")) ); // DP is special-cased in CombineVariants so fill it in here
genotypes.add(new GenotypeBuilder(gt).alleles(refAlleles).PL(pl).make());
}
VCsToReturn.add(new VariantContextBuilder(nonRefVC).attributes(null).attribute("DP", depth).genotypes(genotypes).make());
}
} else {
VCsToReturn.add(vc);
}
}
final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);
return VCsToReturn;
final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID).alleles(alleles)
.loc(longestVC.getChr(), longestVC.getStart(), longestVC.getEnd())
.genotypes(genotypes).unfiltered().attributes(new TreeMap<>(attributes)).log10PError(CommonInfo.NO_LOG10_PERROR); // we will need to regenotype later
// remove stale AC and AF based attributes
removeStaleAttributesAfterMerge(builder);
return builder.make();
}
private static final boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
/**
 * Determines the ref allele given the provided reference base at this position
 *
 * @param VCs collection of unsorted genomic VCs
 * @param loc the current location
 * @param refBase the reference base to fall back on when no context starting at loc supplies a reference allele; may be null
 * @return the reference Allele, or null when neither the VCs nor refBase provide one
 */
private static Allele determineReferenceAlleleGiveReferenceBase(final List<VariantContext> VCs, final GenomeLoc loc, final Byte refBase) {
    final Allele refFromContexts = determineReferenceAllele(VCs, loc);
    if ( refFromContexts != null )
        return refFromContexts;
    // no context began at this locus; fall back on the supplied base if we have one
    return refBase == null ? null : Allele.create(refBase, true);
}
/**
 * Creates an alleles list given a reference allele and a mapper
 *
 * @param refAllele the reference allele
 * @param alleleMapper the allele mapper holding the remapped alternate alleles
 * @return a non-null, non-empty list of Alleles with the reference first
 */
private static List<Allele> getAllelesListFromMapper(final Allele refAllele, final AlleleMapper alleleMapper) {
    final List<Allele> result = new ArrayList<>();
    // the reference allele always goes first, by VCF convention
    result.add(refAllele);
    result.addAll(alleleMapper.getUniqueMappedAlleles());
    return result;
}
/**
 * Remove the stale attributes from the merged VariantContext (builder).
 * These are allele-count/frequency style annotations that are no longer valid
 * after alleles from multiple contexts have been combined.
 *
 * @param builder the VC builder to strip
 */
private static void removeStaleAttributesAfterMerge(final VariantContextBuilder builder) {
    final String[] staleKeys = {
            VCFConstants.ALLELE_COUNT_KEY,
            VCFConstants.ALLELE_FREQUENCY_KEY,
            VCFConstants.ALLELE_NUMBER_KEY,
            VCFConstants.MLE_ALLELE_COUNT_KEY,
            VCFConstants.MLE_ALLELE_FREQUENCY_KEY
    };
    for ( final String key : staleKeys )
        builder.rmAttribute(key);
}
/**
 * Adds attributes to the global map from the new context in a sophisticated manner.
 *
 * Numeric attribute values are parsed and accumulated into annotationMap so the caller can
 * combine them later (e.g. via a median); values that fail numeric parsing fall back to the
 * "keep only if identical in every VC" consistency policy on globalAttributes.
 *
 * @param myAttributes attributes to add from (one VC's INFO field)
 * @param globalAttributes global set of attributes to add to (non-numeric, consistent values)
 * @param inconsistentAttributes set of attribute keys already found to disagree between VCs; such keys are skipped
 * @param annotationMap map of numeric annotation values accumulated per key for combining later
 */
private static void addReferenceConfidenceAttributes(final Map<String, Object> myAttributes,
                                                     final Map<String, Object> globalAttributes,
                                                     final Set<String> inconsistentAttributes,
                                                     final Map<String, List<Comparable>> annotationMap) {
    for ( final Map.Entry<String, Object> p : myAttributes.entrySet() ) {
        final String key = p.getKey();
        final Object value = p.getValue();
        boolean badAnnotation = false;
        // add the annotation values to a list for combining later
        List<Comparable> values = annotationMap.get(key);
        if( values == null ) {
            values = new ArrayList<>();
            annotationMap.put(key, values);
        }
        try {
            final String stringValue = value.toString();
            // Branch to avoid unintentional, implicit type conversions that occur with the ? operator.
            if (stringValue.contains("."))
                values.add(Double.parseDouble(stringValue));
            else
                values.add(Integer.parseInt(stringValue));
        } catch (final NumberFormatException e) {
            // not numeric; handled below via the exact-match consistency path
            badAnnotation = true;
        }
        // only output annotations that have the same value in every input VC
        if ( badAnnotation && ! inconsistentAttributes.contains(key) ) {
            checkForConsistency(key, value, globalAttributes, inconsistentAttributes);
        }
    }
}
/**
 * Check attributes for consistency to others in the merge.
 * An attribute survives in globalAttributes only while every VC agrees on its value
 * (a previously-bound VCF missing value "." may be overwritten).
 *
 * @param key the attribute key
 * @param value the attribute value
 * @param globalAttributes the global list of attributes being merged
 * @param inconsistentAttributes the list of inconsistent attributes in the merge
 */
private static void checkForConsistency(final String key,
                                        final Object value,
                                        final Map<String, Object> globalAttributes,
                                        final Set<String> inconsistentAttributes) {
    // first sighting of this key: just bind it
    if ( ! globalAttributes.containsKey(key) ) {
        globalAttributes.put(key, value);
        return;
    }

    final Object boundValue = globalAttributes.get(key);
    if ( boundValue.equals(VCFConstants.MISSING_VALUE_v4) ) {
        // a missing value is not a real disagreement; replace it
        globalAttributes.put(key, value);
    } else if ( ! boundValue.equals(value) ) {
        // we found the value but we're inconsistent, put it in the exclude list
        inconsistentAttributes.add(key);
        globalAttributes.remove(key);
    }
}
private static boolean hasPLIncompatibleAlleles(final Collection<Allele> alleleSet1, final Collection<Allele> alleleSet2) {
final Iterator<Allele> it1 = alleleSet1.iterator();
final Iterator<Allele> it2 = alleleSet2.iterator();
@ -1262,65 +1355,130 @@ public class GATKVariantContextUtils {
return builder.make();
}
static private Allele determineReferenceAllele(List<VariantContext> VCs) {
static private Allele determineReferenceAllele(final List<VariantContext> VCs) {
return determineReferenceAllele(VCs, null);
}
/**
* Determines the common reference allele
*
* @param VCs the list of VariantContexts
* @param loc if not null, ignore records that do not begin at this start location
* @return possibly null Allele
*/
static private Allele determineReferenceAllele(final List<VariantContext> VCs, final GenomeLoc loc) {
Allele ref = null;
for ( final VariantContext vc : VCs ) {
final Allele myRef = vc.getReference();
if ( ref == null || ref.length() < myRef.length() )
ref = myRef;
else if ( ref.length() == myRef.length() && ! ref.equals(myRef) )
throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef));
if ( contextMatchesLoc(vc, loc) ) {
final Allele myRef = vc.getReference();
if ( ref == null || ref.length() < myRef.length() )
ref = myRef;
else if ( ref.length() == myRef.length() && ! ref.equals(myRef) )
throw new TribbleException(String.format("The provided variant file(s) have inconsistent references for the same position(s) at %s:%d, %s vs. %s", vc.getChr(), vc.getStart(), ref, myRef));
}
}
return ref;
}
static private AlleleMapper resolveIncompatibleAlleles(Allele refAllele, VariantContext vc, Set<Allele> allAlleles) {
/**
 * Does this context start at the given location (a null location matches everything)?
 */
static private boolean contextMatchesLoc(final VariantContext vc, final GenomeLoc loc) {
    if ( loc == null )
        return true;
    return vc.getStart() == loc.getStart();
}
/**
 * Given the reference allele, determines the mapping for common alternate alleles in the list of VariantContexts.
 *
 * @param VCs the list of VariantContexts
 * @param refAllele the reference allele
 * @param loc if not null, ignore records that do not begin at this start location
 * @return non-null AlleleMapper
 */
static private AlleleMapper determineAlternateAlleleMapping(final List<VariantContext> VCs, final Allele refAllele, final GenomeLoc loc) {
    final Map<Allele, Allele> mapping = new HashMap<>();
    for ( final VariantContext vc : VCs ) {
        // spanning records (which don't start at loc) contribute no alternate alleles
        if ( ! contextMatchesLoc(vc, loc) )
            continue;
        addAllAlternateAllelesToMap(vc, refAllele, mapping);
    }
    return new AlleleMapper(mapping);
}
/**
 * Adds all of the alternate alleles from the VariantContext to the allele mapping (for use in creating the AlleleMapper)
 *
 * @param vc the VariantContext
 * @param refAllele the reference allele
 * @param map the allele mapping to populate
 */
static private void addAllAlternateAllelesToMap(final VariantContext vc, final Allele refAllele, final Map<Allele, Allele> map) {
    if ( ! refAllele.equals(vc.getReference()) ) {
        // this context's reference is shorter; its alleles need to be extended to match refAllele
        map.putAll(createAlleleMapping(refAllele, vc, map.values()));
        return;
    }

    // the ref allele matches, so just add the alts as is
    for ( final Allele altAllele : vc.getAlternateAlleles() ) {
        // ignore symbolic alleles
        if ( altAllele.isSymbolic() )
            continue;
        map.put(altAllele, altAllele);
    }
}
static private AlleleMapper resolveIncompatibleAlleles(final Allele refAllele, final VariantContext vc, final Set<Allele> allAlleles) {
if ( refAllele.equals(vc.getReference()) )
return new AlleleMapper(vc);
else {
// we really need to do some work. The refAllele is the longest reference allele seen at this
// start site. So imagine it is:
//
// refAllele: ACGTGA
// myRef: ACGT
// myAlt: A
//
// We need to remap all of the alleles in vc to include the extra GA so that
// myRef => refAllele and myAlt => AGA
//
Allele myRef = vc.getReference();
if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is longer than refAllele="+refAllele);
byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length());
// System.out.printf("Remapping allele at %s%n", vc);
// System.out.printf("ref %s%n", refAllele);
// System.out.printf("myref %s%n", myRef );
// System.out.printf("extrabases %s%n", new String(extraBases));
Map<Allele, Allele> map = new HashMap<>();
for ( final Allele a : vc.getAlleles() ) {
if ( a.isReference() )
map.put(a, refAllele);
else {
Allele extended = Allele.extend(a, extraBases);
for ( final Allele b : allAlleles )
if ( extended.equals(b) )
extended = b;
// System.out.printf(" Extending %s => %s%n", a, extended);
map.put(a, extended);
}
}
// debugging
// System.out.printf("mapping %s%n", map);
final Map<Allele, Allele> map = createAlleleMapping(refAllele, vc, allAlleles);
map.put(vc.getReference(), refAllele);
return new AlleleMapper(map);
}
}
/**
 * Create an allele mapping for the given context where its reference allele must (potentially) be extended to the given allele
 *
 * The refAllele is the longest reference allele seen at this start site.
 * So imagine it is:
 * refAllele: ACGTGA
 * myRef: ACGT
 * myAlt: A
 *
 * We need to remap all of the alleles in vc to include the extra GA so that
 * myRef => refAllele and myAlt => AGA
 *
 * Note that the context's own reference allele is NOT added to the map; the caller is
 * responsible for mapping oneVC's reference to refAllele if it needs that entry.
 *
 * @param refAllele the new (extended) reference allele; must be strictly longer than oneVC's reference
 * @param oneVC the Variant Context to extend
 * @param currentAlleles the list of alleles already created; reused for identical extended alleles
 * @return a non-null mapping of original alternate alleles to new (extended) ones
 */
private static Map<Allele, Allele> createAlleleMapping(final Allele refAllele,
                                                       final VariantContext oneVC,
                                                       final Collection<Allele> currentAlleles) {
    final Allele myRef = oneVC.getReference();
    // note: the guard also rejects equal lengths, since then there would be nothing to extend
    if ( refAllele.length() <= myRef.length() ) throw new IllegalStateException("BUG: myRef="+myRef+" is at least as long as refAllele="+refAllele);
    final byte[] extraBases = Arrays.copyOfRange(refAllele.getBases(), myRef.length(), refAllele.length());

    final Map<Allele, Allele> map = new HashMap<>();
    for ( final Allele a : oneVC.getAlternateAlleles() ) {
        if ( isUsableAlternateAllele(a) ) {
            Allele extended = Allele.extend(a, extraBases);
            // reuse an already-created identical allele object if one exists
            for ( final Allele b : currentAlleles )
                if ( extended.equals(b) )
                    extended = b;
            map.put(a, extended);
        }
    }

    return map;
}
/**
 * A usable alternate allele is a concrete (non-symbolic) allele that is not the reference.
 */
static private boolean isUsableAlternateAllele(final Allele allele) {
    return ! allele.isReference() && ! allele.isSymbolic();
}
public static List<VariantContext> sortVariantContextsByPriority(Collection<VariantContext> unsortedVCs, List<String> priorityListOfVCs, GenotypeMergeType mergeOption ) {
if ( mergeOption == GenotypeMergeType.PRIORITIZE && priorityListOfVCs == null )
throw new IllegalArgumentException("Cannot merge calls by priority with a null priority list");
@ -1352,6 +1510,124 @@ public class GATKVariantContextUtils {
}
}
/**
 * Replaces any alleles in the list with NO CALLS, except for the generic ALT allele
 *
 * @param alleles list of alleles to replace
 * @return non-null list of alleles
 */
private static List<Allele> replaceWithNoCalls(final List<Allele> alleles) {
    if ( alleles == null ) throw new IllegalArgumentException("list of alleles cannot be null");

    final List<Allele> result = new ArrayList<>(alleles.size());
    for ( final Allele allele : alleles ) {
        // the generic <NON_REF> allele is preserved; everything else becomes a no-call
        if ( allele.equals(NON_REF_SYMBOLIC_ALLELE) )
            result.add(allele);
        else
            result.add(Allele.NO_CALL);
    }
    return result;
}
/**
 * Merge into the context a new genotype represented by the given VariantContext for the provided list of target alleles.
 * This method assumes that none of the alleles in the VC overlaps with any of the alleles in the set.
 *
 * @param mergedGenotypes the genotypes context to add to
 * @param VC the Variant Context for the sample
 * @param remappedAlleles the list of remapped alleles for the sample
 * @param targetAlleles the list of target alleles
 */
private static void mergeRefConfidenceGenotypes(final GenotypesContext mergedGenotypes,
                                                final VariantContext VC,
                                                final List<Allele> remappedAlleles,
                                                final List<Allele> targetAlleles) {
    for ( final Genotype g : VC.getGenotypes() ) {
        final String name = g.getSampleName();
        // only add if the name is new
        if ( mergedGenotypes.containsSample(name) )
            continue;

        // we need to modify it even if it already contains all of the alleles because we need to purge the <ALT> PLs out anyways
        final int[] indexesOfRelevantAlleles = getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles);
        final int[] newPLs = generatePLs(g, indexesOfRelevantAlleles);

        // note that we set the alleles to null here (as we expect it to be re-genotyped)
        final Genotype updated = new GenotypeBuilder(g).name(name).alleles(null).PL(newPLs).noAD().noGQ().make();
        mergedGenotypes.add(updated);
    }
}
/**
 * Determines the allele mapping from myAlleles to the targetAlleles, substituting the generic "&lt;ALT&gt;" as appropriate.
 * If the myAlleles set does not contain "&lt;ALT&gt;" as an allele, it throws an exception.
 *
 * @param remappedAlleles the list of alleles to evaluate
 * @param targetAlleles the target list of alleles
 * @return non-null array of ints representing indexes
 */
protected static int[] getIndexesOfRelevantAlleles(final List<Allele> remappedAlleles, final List<Allele> targetAlleles) {
    if ( remappedAlleles == null || remappedAlleles.size() == 0 ) throw new IllegalArgumentException("The list of input alleles must not be null or empty");
    if ( targetAlleles == null || targetAlleles.size() == 0 ) throw new IllegalArgumentException("The list of target alleles must not be null or empty");

    // a single lookup serves as both the presence check and the fallback index
    final int indexOfGenericAlt = remappedAlleles.indexOf(NON_REF_SYMBOLIC_ALLELE);
    if ( indexOfGenericAlt == -1 )
        throw new IllegalArgumentException("The list of input alleles must contain " + NON_REF_SYMBOLIC_ALLELE + " as an allele; please use the Haplotype Caller with gVCF output to generate appropriate records");

    final int[] indexMapping = new int[targetAlleles.size()];

    // the reference alleles always match up (even if they don't appear to)
    indexMapping[0] = 0;

    // create the index mapping, using the <ALT> allele whenever such a mapping doesn't exist
    for ( int i = 1; i < targetAlleles.size(); i++ ) {
        final int found = remappedAlleles.indexOf(targetAlleles.get(i));
        indexMapping[i] = (found == -1) ? indexOfGenericAlt : found;
    }

    return indexMapping;
}
/**
 * Generates new PLs given the set of indexes of the Genotype's current alleles from the original PLs.
 * Throws an exception if the Genotype does not contain PLs.
 *
 * @param genotype the genotype from which to grab PLs
 * @param indexesOfRelevantAlleles the indexes of the original alleles corresponding to the new alleles
 * @return non-null array of new PLs
 */
protected static int[] generatePLs(final Genotype genotype, final int[] indexesOfRelevantAlleles) {
    if ( !genotype.hasPL() )
        throw new IllegalArgumentException("Cannot generate new PLs from a genotype without PLs");

    final int[] originalPLs = genotype.getPL();
    final int numNewAlleles = indexesOfRelevantAlleles.length;

    // assume diploid
    final int[] newPLs = new int[GenotypeLikelihoods.numLikelihoods(numNewAlleles, 2)];

    // walk every unordered pair of new alleles and copy over the likelihood of the corresponding original genotype
    for ( int allele1 = 0; allele1 < numNewAlleles; allele1++ ) {
        for ( int allele2 = allele1; allele2 < numNewAlleles; allele2++ ) {
            final int originalPLindex = calculatePLindexFromUnorderedIndexes(indexesOfRelevantAlleles[allele1], indexesOfRelevantAlleles[allele2]);
            if ( originalPLindex >= originalPLs.length )
                throw new IllegalStateException("The original PLs do not have enough values; accessing index " + originalPLindex + " but size is " + originalPLs.length);

            newPLs[GenotypeLikelihoods.calculatePLindex(allele1, allele2)] = originalPLs[originalPLindex];
        }
    }

    return newPLs;
}
/**
 * This is just a safe wrapper around GenotypeLikelihoods.calculatePLindex()
 *
 * @param originalIndex1 the index of the first allele
 * @param originalIndex2 the index of the second allele
 * @return the PL index
 */
protected static int calculatePLindexFromUnorderedIndexes(final int originalIndex1, final int originalIndex2) {
    // calculatePLindex requires its arguments in ascending order
    final int smaller = Math.min(originalIndex1, originalIndex2);
    final int larger = Math.max(originalIndex1, originalIndex2);
    return GenotypeLikelihoods.calculatePLindex(smaller, larger);
}
/**
 * Builds the sample name to use in a merged call set.
 * When uniquifying, the track name is appended so the same sample arriving from
 * two different rods stays distinguishable.
 *
 * @param trackName the name of the input track (rod)
 * @param sampleName the original sample name
 * @param uniquify whether to make the name unique per track
 * @return the (possibly uniquified) sample name
 */
public static String mergedSampleName(String trackName, String sampleName, boolean uniquify ) {
    if ( ! uniquify )
        return sampleName;
    return sampleName + "." + trackName;
}
@ -1717,6 +1993,15 @@ public class GATKVariantContextUtils {
}
return newAs;
}
/**
 * @return the list of unique mapped allele values (empty when this mapper is backed by a VC rather than a map)
 */
public List<Allele> getUniqueMappedAlleles() {
    if ( map == null )
        return Collections.emptyList();
    // de-duplicate the mapped values before returning them as a list
    final Set<Allele> unique = new HashSet<>(map.values());
    return new ArrayList<>(unique);
}
}
private static class CompareByPriority implements Comparator<VariantContext>, Serializable {

View File

@ -27,12 +27,9 @@ package org.broadinstitute.sting.utils.variant;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.variant.variantcontext.*;
import org.broadinstitute.variant.vcf.VCFConstants;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
@ -182,7 +179,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
inputs, priority,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, "set", false, false);
Assert.assertEquals(merged.getAlleles(), cfg.expected);
}
@ -240,7 +237,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
inputs, null,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.UNSORTED, false, false, "set", false, false);
Assert.assertEquals(merged.getID(), cfg.expected);
}
@ -355,7 +352,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
public void testMergeFiltered(MergeFilteredTest cfg) {
final List<String> priority = vcs2priority(cfg.inputs);
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false);
cfg.inputs, priority, cfg.type, GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false);
// test alleles are equal
Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles());
@ -482,7 +479,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
public void testMergeGenotypes(MergeGenotypesTest cfg) {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
cfg.inputs, cfg.priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, true, false, "set", false, false);
// test alleles are equal
Assert.assertEquals(merged.getAlleles(), cfg.expected.getAlleles());
@ -523,7 +520,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1, vc2), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false, false);
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, false, "set", false, false);
// test genotypes
Assert.assertEquals(merged.getSampleNames(), new HashSet<>(Arrays.asList("s1.1", "s1.2")));
@ -556,7 +553,7 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1, vc2), priority, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false, false);
GATKVariantContextUtils.GenotypeMergeType.PRIORITIZE, annotate, false, set, false, false);
if ( annotate )
Assert.assertEquals(merged.getAttribute(set), GATKVariantContextUtils.MERGE_INTERSECTION);
@ -1036,26 +1033,6 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
}
}
@Test(enabled = !DEBUG)
public void testFillInNonRefSymbolicAlleles() {
    // Build one concrete SNP record and one record carrying only the <NON_REF> symbolic allele
    final int position = 10;
    final String refBase = "A";
    final VariantContext concreteVC = GATKVariantContextUtils.makeFromAlleles("test", "20", position, Arrays.asList(refBase, "C"));
    final String nonRefAlleleString = "<"+GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE_NAME+">";
    final VariantContext symbolicVC = GATKVariantContextUtils.makeFromAlleles("test", "20", position, Arrays.asList(refBase, nonRefAlleleString));

    final List<VariantContext> filled = GATKVariantContextUtils.fillInNonRefSymbolicAlleles(Arrays.asList(concreteVC, symbolicVC), Collections.<VariantContext>emptyList());

    // every context should now carry exactly one concrete (non-symbolic) alternate allele
    for ( final VariantContext vc : filled ) {
        Assert.assertEquals(vc.getAlternateAlleles().size(), 1);
        Assert.assertTrue(vc.getAlternateAllele(0).isNonReference());
        Assert.assertFalse(vc.getReference().isSymbolic());
        Assert.assertFalse(vc.getAlternateAllele(0).isSymbolic());
    }
}
// --------------------------------------------------------------------------------
//
// test allele remapping
@ -1428,4 +1405,225 @@ public class GATKVariantContextUtilsUnitTest extends BaseTest {
assertGenotypesAreEqual(actualGT, expected);
}
}
// --------------------------------------------------------------------------------
//
// Test methods for merging reference confidence VCs
//
// --------------------------------------------------------------------------------
// Data provider: for allele counts 2..5, every index ordering obtained by swapping
// exactly one pair of positions in the identity permutation.
@DataProvider(name = "generatePLsData")
public Object[][] makeGeneratePLsData() {
    final List<Object[]> cases = new ArrayList<>();
    for ( int alleleCount = 2; alleleCount <= 5; alleleCount++ ) {
        for ( int first = 0; first < alleleCount; first++ ) {
            for ( int second = first + 1; second < alleleCount; second++ ) {
                // start from the identity mapping ...
                final int[] ordering = new int[alleleCount];
                for ( int pos = 0; pos < alleleCount; pos++ )
                    ordering[pos] = pos;
                // ... then swap the two chosen positions
                ordering[first] = second;
                ordering[second] = first;
                cases.add(new Object[]{alleleCount, ordering});
            }
        }
    }
    return cases.toArray(new Object[cases.size()][]);
}
// Verifies that generatePLs() reorders a genotype's PL array according to the supplied
// allele index ordering.  The input PLs are seeded with PL[i] == i so that each
// expected entry can be computed directly from PL-index arithmetic.
@Test(dataProvider = "generatePLsData")
public void testGeneratePLs(final int numOriginalAlleles, final int[] indexOrdering) {
    // number of diploid genotype likelihood entries for this many alleles
    final int numLikelihoods = GenotypeLikelihoods.numLikelihoods(numOriginalAlleles, 2);
    final int[] PLs = new int[numLikelihoods];
    // seed PL[i] = i so each entry's value identifies its original position
    for ( int i = 0; i < numLikelihoods; i++ )
        PLs[i] = i;

    // genotype with ref "A" and alternate alleles of increasing length ("AA", "AAA", ...)
    final List<Allele> alleles = new ArrayList<>(numOriginalAlleles);
    alleles.add(Allele.create("A", true));
    for ( int i = 1; i < numOriginalAlleles; i++ )
        alleles.add(Allele.create(Utils.dupString('A', i + 1), false));
    final Genotype genotype = new GenotypeBuilder("foo", alleles).PL(PLs).make();

    final int[] newPLs = GATKVariantContextUtils.generatePLs(genotype, indexOrdering);
    Assert.assertEquals(newPLs.length, numLikelihoods);

    // expected: the entry for genotype (i,j) comes from the original entry for the
    // remapped allele pair (indexOrdering[i], indexOrdering[j]); because PL[i] == i,
    // that original value is simply the PL index of the unordered remapped pair
    final int[] expectedPLs = new int[numLikelihoods];
    for ( int i = 0; i < numOriginalAlleles; i++ ) {
        for ( int j = i; j < numOriginalAlleles; j++ ) {
            final int index = GenotypeLikelihoods.calculatePLindex(i, j);
            final int value = GATKVariantContextUtils.calculatePLindexFromUnorderedIndexes(indexOrdering[i], indexOrdering[j]);
            expectedPLs[index] = value;
        }
    }

    for ( int i = 0; i < numLikelihoods; i++ ) {
        Assert.assertEquals(newPLs[i], expectedPLs[i]);
    }
}
// getIndexesOfRelevantAlleles() requires the <ALT> symbolic allele in the first list;
// passing two plain ref-only lists must raise IllegalArgumentException.
@Test(expectedExceptions = IllegalArgumentException.class)
public void testGetIndexesOfRelevantAllelesWithNoALT() {
    final List<Allele> remappedAlleles = new ArrayList<>(1);
    remappedAlleles.add(Allele.create("A", true));
    final List<Allele> targetAlleles = new ArrayList<>(1);
    targetAlleles.add(Allele.create("A", true));
    GATKVariantContextUtils.getIndexesOfRelevantAlleles(remappedAlleles, targetAlleles);
    Assert.fail("We should have thrown an exception because the <ALT> allele was not present");
}
// Data provider: a fixed allele set (ref "A" plus alts "AA".."AAAAA") paired with
// each possible allele index, so the test can probe every mapping position.
@DataProvider(name = "getIndexesOfRelevantAllelesData")
public Object[][] makeGetIndexesOfRelevantAllelesData() {
    final int totalAlleles = 5;
    final List<Allele> alleleSet = new ArrayList<>(totalAlleles);
    alleleSet.add(Allele.create("A", true));
    // alternate alleles of increasing length: "AA", "AAA", ...
    for ( int length = 2; length <= totalAlleles; length++ )
        alleleSet.add(Allele.create(Utils.dupString('A', length), false));

    final List<Object[]> cases = new ArrayList<>();
    for ( int index = 0; index < totalAlleles; index++ )
        cases.add(new Object[]{index, alleleSet});
    return cases.toArray(new Object[cases.size()][]);
}
// Checks the allele-index mapping produced by getIndexesOfRelevantAlleles(): ref maps
// to itself, the one concrete alternate maps to its local slot, everything else maps
// to the <ALT> symbolic allele.
@Test(dataProvider = "getIndexesOfRelevantAllelesData")
public void testGetIndexesOfRelevantAlleles(final int allelesIndex, final List<Allele> allAlleles) {
    // local allele list: ref at 0, <ALT> at 1, and optionally one concrete alt at 2
    final List<Allele> localAlleles = new ArrayList<>(3);
    localAlleles.add(allAlleles.get(0));
    localAlleles.add(GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    if ( allelesIndex > 0 )
        localAlleles.add(allAlleles.get(allelesIndex));

    final int[] mapping = GATKVariantContextUtils.getIndexesOfRelevantAlleles(localAlleles, allAlleles);
    Assert.assertEquals(mapping.length, allAlleles.size());

    for ( int targetIndex = 0; targetIndex < allAlleles.size(); targetIndex++ ) {
        final int expectedSlot;
        if ( targetIndex == 0 )
            expectedSlot = 0;      // reference always maps to itself
        else if ( targetIndex == allelesIndex )
            expectedSlot = 2;      // the concrete alternate allele
        else
            expectedSlot = 1;      // everything else falls back to <ALT>
        Assert.assertEquals(mapping[targetIndex], expectedSlot);
    }
}
// Data provider for testReferenceConfidenceMerge.  Each case is
// { input VCs, location, returnSiteEvenIfMonomorphic, expected merged VC (or null) }.
@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {
    final List<Object[]> tests = new ArrayList<>();
    final int start = 10;
    final GenomeLoc loc = new UnvalidatingGenomeLoc("20", 0, start, start);
    final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make();
    // a context starting one base earlier, used to build records that span the target locus
    final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start-1, start-1, Arrays.asList(Aref)).make();

    // PLs for a ref + alt + <ALT> genotype, and the same values after the concrete alt
    // is moved to the second alternate-allele slot in a merged 3-allele context
    final int[] standardPLs = new int[]{30, 20, 10, 71, 72, 73};
    final int[] reorderedSecondAllelePLs = new int[]{30, 71, 73, 20, 72, 10};

    // ref-only record at the locus: A + <ALT>
    final List<Allele> A_ALT = Arrays.asList(Aref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[]{0, 100, 1000}).make();
    final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make();

    // spanning ref-only record anchored at the previous base: AA + <ALT>
    final Allele AAref = Allele.create("AA", true);
    final List<Allele> AA_ALT = Arrays.asList(AAref, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[]{0, 80, 800}).make();
    final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make();

    // SNP A->C, with and without the <ALT> allele
    final List<Allele> A_C = Arrays.asList(Aref, C);
    final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10}).make();
    final List<Allele> A_C_ALT = Arrays.asList(Aref, C, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).make();
    final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make();

    // SNP A->G + <ALT>
    final List<Allele> A_G_ALT = Arrays.asList(Aref, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).make();
    final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make();

    // multi-allelic SNP A->C,G, with and without the <ALT> allele
    final List<Allele> A_C_G = Arrays.asList(Aref, C, G);
    final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30}).make();
    final List<Allele> A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 20, 10, 30, 71, 72, 73, 74}).make();
    final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make();

    // insertion A->ATC + <ALT>
    final List<Allele> A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).make();
    final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make();

    // spanning deletion AA->A + <ALT>, anchored at the previous base
    final Allele A = Allele.create("A", false);
    final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVariantContextUtils.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).make();
    final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();

    // first test the case of a single record
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make()});

    // now, test pairs:
    // a SNP with another SNP
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).make()).make()});
    // a SNP with an indel
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ATC_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).make()).make()});
    // a SNP with 2 SNPs
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_C_G_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make()});
    // a SNP with a ref record
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make()});

    // spanning records:
    // a SNP with a spanning ref record
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make()});
    // a SNP with a spanning deletion
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcAA_A_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73}).make()).make()});

    // combination of all
    tests.add(new Object[]{Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT),
            loc, false,
            new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC, G)).genotypes(new GenotypeBuilder("A_C").PL(new int[]{30, 20, 10, 71, 72, 73, 71, 72, 73, 73}).make(),
                    new GenotypeBuilder("A_G").PL(new int[]{30, 71, 73, 71, 73, 73, 20, 72, 72, 10}).make(),
                    new GenotypeBuilder("A_ATC").PL(new int[]{30, 71, 73, 20, 72, 10, 71, 73, 72, 73}).make(),
                    new GenotypeBuilder("A_C_G").PL(new int[]{40, 20, 30, 71, 72, 74, 20, 10, 73, 30}).make(),
                    new GenotypeBuilder("A").PL(new int[]{0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000}).make(),
                    new GenotypeBuilder("AA").PL(new int[]{0, 80, 800, 80, 800, 800, 80, 800, 800, 800}).make(),
                    new GenotypeBuilder("AA_A").PL(new int[]{30, 71, 73, 71, 73, 73, 71, 73, 73, 73}).make()).make()});

    // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
    tests.add(new Object[]{Arrays.asList(vcAA_ALT),
            loc, false,
            null});
    tests.add(new Object[]{Arrays.asList(vcAA_ALT),
            loc, true,
            new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[]{0}).make()).make()});

    return tests.toArray(new Object[][]{});
}
// Runs referenceConfidenceMerge() on each provider case and compares the result
// against the expected context (alleles, sample count, and per-sample genotypes).
@Test(dataProvider = "referenceConfidenceMergeData")
public void testReferenceConfidenceMerge(final List<VariantContext> toMerge, final GenomeLoc loc, final boolean returnSiteEvenIfMonomorphic, final VariantContext expectedResult) {
    // a non-null reference base requests a site even when no alternate allele survives the merge
    final Byte refBaseOrNull = returnSiteEvenIfMonomorphic ? (byte)'A' : null;
    final VariantContext merged = GATKVariantContextUtils.referenceConfidenceMerge(toMerge, loc, refBaseOrNull);

    if ( merged == null ) {
        Assert.assertNull(expectedResult);
        return;
    }

    Assert.assertEquals(merged.getAlleles(), expectedResult.getAlleles());
    Assert.assertEquals(merged.getNSamples(), expectedResult.getNSamples());

    // use string comparisons to test genotype equality for now
    for ( final Genotype expectedGenotype : expectedResult.getGenotypes() ) {
        final String sample = expectedGenotype.getSampleName();
        Assert.assertTrue(merged.hasGenotype(sample), "Missing " + sample);
        Assert.assertEquals(merged.getGenotype(sample).toString(), expectedGenotype.toString());
    }
}
}

View File

@ -234,7 +234,7 @@ public class VariantContextBenchmark extends SimpleBenchmark {
GATKVariantContextUtils.simpleMerge(toMerge, null,
GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNSORTED,
true, false, "set", false, true, false);
true, false, "set", false, true);
}
};