Optimization in CombineVariants when merging into a sites_only VCF
VariantContextUtils now was a utility function that creates a sitesOnlyVariantContext from an input VC Add complex merge test of SNPs and indels from the new batch merge wiki in : http://www.broadinstitute.org/gsa/wiki/index.php/Merging_batched_call_sets with multiple alleles for an indel. Created a BatchMergeIntegrationTest that uses GGA with the complex merged input alleles to genotype SNPs and Indels with multiple alleles simultaneously in NA12878. Looks great. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5959 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1d6486a28f
commit
0f43b10c39
|
|
@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.contexts.variantcontext;
|
|||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
import com.google.java.contract.*;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.apache.commons.jexl2.*;
|
||||
|
|
@ -282,11 +283,39 @@ public class VariantContextUtils {
|
|||
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly allocated VC that is the same as VC, but without genotypes
|
||||
* @param vc
|
||||
* @return
|
||||
*/
|
||||
@Requires("vc != null")
|
||||
@Ensures("result != null")
|
||||
public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
|
||||
return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(),
|
||||
vc.getAlleles(), vc.getNegLog10PError(),
|
||||
vc.filtersWereApplied() ? vc.getFilters() : null,
|
||||
vc.getAttributes());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
|
||||
* @param vcs
|
||||
* @return
|
||||
*/
|
||||
@Requires("vcs != null")
|
||||
@Ensures("result != null")
|
||||
public static Collection<VariantContext> sitesOnlyVariantContexts(Collection<VariantContext> vcs) {
|
||||
List<VariantContext> r = new ArrayList<VariantContext>();
|
||||
for ( VariantContext vc : vcs )
|
||||
r.add(sitesOnlyVariantContext(vc));
|
||||
return r;
|
||||
}
|
||||
|
||||
public static VariantContext pruneVariantContext(VariantContext vc) {
|
||||
return pruneVariantContext(vc, null);
|
||||
}
|
||||
|
||||
public static VariantContext pruneVariantContext(VariantContext vc, Set<String> keysToPreserve ) {
|
||||
public static VariantContext pruneVariantContext(VariantContext vc, Collection<String> keysToPreserve ) {
|
||||
MutableVariantContext mvc = new MutableVariantContext(vc);
|
||||
|
||||
if ( keysToPreserve == null || keysToPreserve.size() == 0 )
|
||||
|
|
@ -304,9 +333,10 @@ public class VariantContextUtils {
|
|||
for ( Genotype g : gs ) {
|
||||
MutableGenotype mg = new MutableGenotype(g);
|
||||
mg.clearAttributes();
|
||||
for ( String key : keysToPreserve )
|
||||
if ( g.hasAttribute(key) )
|
||||
mg.putAttribute(key, g.getAttribute(key));
|
||||
if ( keysToPreserve != null )
|
||||
for ( String key : keysToPreserve )
|
||||
if ( g.hasAttribute(key) )
|
||||
mg.putAttribute(key, g.getAttribute(key));
|
||||
mvc.addGenotype(mg);
|
||||
}
|
||||
|
||||
|
|
@ -536,7 +566,7 @@ public class VariantContextUtils {
|
|||
}
|
||||
|
||||
// if at least one record was unfiltered and we want a union, clear all of the filters
|
||||
if ( filteredRecordMergeType == filteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
|
||||
if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
|
||||
filters.clear();
|
||||
|
||||
// we care about where the call came from
|
||||
|
|
@ -795,7 +825,7 @@ public class VariantContextUtils {
|
|||
* @return the genomeLoc
|
||||
*/
|
||||
public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) {
|
||||
return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd(), true);
|
||||
return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true);
|
||||
}
|
||||
|
||||
public abstract static class AlleleMergeRule {
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import com.sun.tools.internal.xjc.reader.gbind.ElementSets;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.*;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
|
|
@ -42,6 +43,7 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
|
@ -96,6 +98,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
private List<String> priority = null;
|
||||
|
||||
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
|
||||
private boolean sitesOnlyVCF = false;
|
||||
|
||||
public void initialize() {
|
||||
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null);
|
||||
|
||||
|
|
@ -113,7 +118,13 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
||||
if ( SET_KEY != null )
|
||||
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
||||
vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples));
|
||||
|
||||
if ( vcfWriter instanceof VCFWriterStub) {
|
||||
sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes();
|
||||
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
|
||||
} else
|
||||
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
|
||||
}
|
||||
|
||||
private void validateAnnotateUnionArguments() {
|
||||
|
|
@ -142,6 +153,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
// Need to provide reference bases to simpleMerge starting at current locus
|
||||
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, null,context.getLocation(), true, false);
|
||||
|
||||
if ( sitesOnlyVCF ) {
|
||||
vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
|
||||
}
|
||||
|
||||
if ( ASSUME_IDENTICAL_SAMPLES ) {
|
||||
for ( final VariantContext vc : vcs ) {
|
||||
vcfWriter.add( vc, ref.getBase() );
|
||||
|
|
@ -154,13 +169,12 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
for (VariantContext vc : vcs) {
|
||||
if (vc.filtersWereApplied() && vc.isFiltered())
|
||||
numFilteredRecords++;
|
||||
|
||||
}
|
||||
|
||||
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
|
||||
return 0;
|
||||
|
||||
VariantContext mergedVC = null;
|
||||
VariantContext mergedVC;
|
||||
if ( master ) {
|
||||
mergedVC = VariantContextUtils.masterMerge(vcs, "master");
|
||||
} else {
|
||||
|
|
@ -176,7 +190,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
|||
VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false);
|
||||
VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes);
|
||||
if ( minimalVCF )
|
||||
annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, new HashSet(Arrays.asList(SET_KEY)));
|
||||
annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY));
|
||||
vcfWriter.add(annotatedMergedVC, ref.getBase());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,46 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class BatchMergeIntegrationTest extends WalkerTest {
|
||||
@Test
|
||||
public void testBatchMerge1() {
|
||||
String bam = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
|
||||
String alleles = validationDataLocation + "batch.merge.alleles.vcf";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T UnifiedGenotyper -NO_HEADER -BTI alleles -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference
|
||||
+ " -B:alleles,VCF " + alleles
|
||||
+ " -I " + bam,
|
||||
1,
|
||||
Arrays.asList("db0611522fb691adbd9903d325b879f7"));
|
||||
executeTest("testBatchMerge UG genotype given alleles:" + new File(bam).getName() + " with " + new File(alleles).getName(), spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -34,7 +34,6 @@ import java.util.Arrays;
|
|||
* Tests CombineVariants
|
||||
*/
|
||||
public class CombineVariantsIntegrationTest extends WalkerTest {
|
||||
|
||||
public static String baseTestString(String args) {
|
||||
return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args;
|
||||
}
|
||||
|
|
@ -104,6 +103,24 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
|
|||
1,
|
||||
Arrays.asList("a07995587b855f3214fb71940bf23c0f"));
|
||||
executeTest("threeWayWithRefs", spec);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// complex examples with filtering, indels, and multiple alleles
|
||||
public void combineComplexSites(String args, String md5) {
|
||||
String file1 = "combine.1.vcf";
|
||||
String file2 = "combine.2.vcf";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference
|
||||
+ " -B:one,VCF " + validationDataLocation + file1
|
||||
+ " -B:two,VCF " + validationDataLocation + file2 + args,
|
||||
1,
|
||||
Arrays.asList(md5));
|
||||
executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec);
|
||||
}
|
||||
|
||||
@Test public void complexTestFull() { combineComplexSites("", "ec593a12f932af92f0f6e38cf9c4bc60"); }
|
||||
@Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "977e27e407674e8a162f1773310294a5"); }
|
||||
@Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "4bcab3ae334c718f4cf3203fe38d7ce6"); }
|
||||
@Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "c70d0ccdb821921ea5bf7975364ad5a0"); }
|
||||
}
|
||||
Loading…
Reference in New Issue