Optimization in CombineVariants when merging into a sites_only VCF

VariantContextUtils now was a utility function that creates a sitesOnlyVariantContext from an input VC
Add complex merge test of SNPs and indels from the new batch merge wiki in :

http://www.broadinstitute.org/gsa/wiki/index.php/Merging_batched_call_sets

with multiple alleles for an indel.  Created a BatchMergeIntegrationTest that uses GGA with the complex merged input alleles to genotype SNPs and Indels with multiple alleles simultaneously in NA12878.  Looks great.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5959 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-06-08 14:31:46 +00:00
parent 1d6486a28f
commit 0f43b10c39
4 changed files with 120 additions and 13 deletions

View File

@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.contexts.variantcontext;
import java.io.Serializable;
import java.util.*;
import com.google.java.contract.*;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.util.StringUtil;
import org.apache.commons.jexl2.*;
@ -282,11 +283,39 @@ public class VariantContextUtils {
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
}
/**
* Returns a newly allocated VC that is the same as VC, but without genotypes
* @param vc
* @return
*/
@Requires("vc != null")
@Ensures("result != null")
public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(),
vc.getAlleles(), vc.getNegLog10PError(),
vc.filtersWereApplied() ? vc.getFilters() : null,
vc.getAttributes());
}
/**
* Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
* @param vcs
* @return
*/
@Requires("vcs != null")
@Ensures("result != null")
public static Collection<VariantContext> sitesOnlyVariantContexts(Collection<VariantContext> vcs) {
List<VariantContext> r = new ArrayList<VariantContext>();
for ( VariantContext vc : vcs )
r.add(sitesOnlyVariantContext(vc));
return r;
}
public static VariantContext pruneVariantContext(VariantContext vc) {
return pruneVariantContext(vc, null);
}
public static VariantContext pruneVariantContext(VariantContext vc, Set<String> keysToPreserve ) {
public static VariantContext pruneVariantContext(VariantContext vc, Collection<String> keysToPreserve ) {
MutableVariantContext mvc = new MutableVariantContext(vc);
if ( keysToPreserve == null || keysToPreserve.size() == 0 )
@ -304,9 +333,10 @@ public class VariantContextUtils {
for ( Genotype g : gs ) {
MutableGenotype mg = new MutableGenotype(g);
mg.clearAttributes();
for ( String key : keysToPreserve )
if ( g.hasAttribute(key) )
mg.putAttribute(key, g.getAttribute(key));
if ( keysToPreserve != null )
for ( String key : keysToPreserve )
if ( g.hasAttribute(key) )
mg.putAttribute(key, g.getAttribute(key));
mvc.addGenotype(mg);
}
@ -536,7 +566,7 @@ public class VariantContextUtils {
}
// if at least one record was unfiltered and we want a union, clear all of the filters
if ( filteredRecordMergeType == filteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
filters.clear();
// we care about where the call came from
@ -795,7 +825,7 @@ public class VariantContextUtils {
* @return the genomeLoc
*/
public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) {
return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd(), true);
return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true);
}
public abstract static class AlleleMergeRule {

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import com.sun.tools.internal.xjc.reader.gbind.ElementSets;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Hidden;
@ -42,6 +43,7 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
import java.util.*;
@ -96,6 +98,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
private List<String> priority = null;
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
private boolean sitesOnlyVCF = false;
public void initialize() {
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null);
@ -113,7 +118,13 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
if ( SET_KEY != null )
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples));
if ( vcfWriter instanceof VCFWriterStub) {
sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes();
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
} else
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
}
private void validateAnnotateUnionArguments() {
@ -142,6 +153,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
// Need to provide reference bases to simpleMerge starting at current locus
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, null,context.getLocation(), true, false);
if ( sitesOnlyVCF ) {
vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
}
if ( ASSUME_IDENTICAL_SAMPLES ) {
for ( final VariantContext vc : vcs ) {
vcfWriter.add( vc, ref.getBase() );
@ -154,13 +169,12 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
for (VariantContext vc : vcs) {
if (vc.filtersWereApplied() && vc.isFiltered())
numFilteredRecords++;
}
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
return 0;
VariantContext mergedVC = null;
VariantContext mergedVC;
if ( master ) {
mergedVC = VariantContextUtils.masterMerge(vcs, "master");
} else {
@ -176,7 +190,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false);
VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes);
if ( minimalVCF )
annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, new HashSet(Arrays.asList(SET_KEY)));
annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY));
vcfWriter.add(annotatedMergedVC, ref.getBase());
}

View File

@ -0,0 +1,46 @@
/*
* Copyright (c) 2010, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.io.File;
import java.util.Arrays;
public class BatchMergeIntegrationTest extends WalkerTest {
@Test
public void testBatchMerge1() {
String bam = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
String alleles = validationDataLocation + "batch.merge.alleles.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T UnifiedGenotyper -NO_HEADER -BTI alleles -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference
+ " -B:alleles,VCF " + alleles
+ " -I " + bam,
1,
Arrays.asList("db0611522fb691adbd9903d325b879f7"));
executeTest("testBatchMerge UG genotype given alleles:" + new File(bam).getName() + " with " + new File(alleles).getName(), spec);
}
}

View File

@ -34,7 +34,6 @@ import java.util.Arrays;
* Tests CombineVariants
*/
public class CombineVariantsIntegrationTest extends WalkerTest {
public static String baseTestString(String args) {
return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args;
}
@ -104,6 +103,24 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
1,
Arrays.asList("a07995587b855f3214fb71940bf23c0f"));
executeTest("threeWayWithRefs", spec);
}
}
// complex examples with filtering, indels, and multiple alleles
public void combineComplexSites(String args, String md5) {
String file1 = "combine.1.vcf";
String file2 = "combine.2.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference
+ " -B:one,VCF " + validationDataLocation + file1
+ " -B:two,VCF " + validationDataLocation + file2 + args,
1,
Arrays.asList(md5));
executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec);
}
@Test public void complexTestFull() { combineComplexSites("", "ec593a12f932af92f0f6e38cf9c4bc60"); }
@Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "977e27e407674e8a162f1773310294a5"); }
@Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "4bcab3ae334c718f4cf3203fe38d7ce6"); }
@Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "c70d0ccdb821921ea5bf7975364ad5a0"); }
}