Optimization in CombineVariants when merging into a sites_only VCF
VariantContextUtils now was a utility function that creates a sitesOnlyVariantContext from an input VC Add complex merge test of SNPs and indels from the new batch merge wiki in : http://www.broadinstitute.org/gsa/wiki/index.php/Merging_batched_call_sets with multiple alleles for an indel. Created a BatchMergeIntegrationTest that uses GGA with the complex merged input alleles to genotype SNPs and Indels with multiple alleles simultaneously in NA12878. Looks great. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5959 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
1d6486a28f
commit
0f43b10c39
|
|
@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.contexts.variantcontext;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
import com.google.java.contract.*;
|
||||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||||
import net.sf.samtools.util.StringUtil;
|
import net.sf.samtools.util.StringUtil;
|
||||||
import org.apache.commons.jexl2.*;
|
import org.apache.commons.jexl2.*;
|
||||||
|
|
@ -282,11 +283,39 @@ public class VariantContextUtils {
|
||||||
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
|
return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a newly allocated VC that is the same as VC, but without genotypes
|
||||||
|
* @param vc
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
@Requires("vc != null")
|
||||||
|
@Ensures("result != null")
|
||||||
|
public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
|
||||||
|
return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(),
|
||||||
|
vc.getAlleles(), vc.getNegLog10PError(),
|
||||||
|
vc.filtersWereApplied() ? vc.getFilters() : null,
|
||||||
|
vc.getAttributes());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
|
||||||
|
* @param vcs
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
@Requires("vcs != null")
|
||||||
|
@Ensures("result != null")
|
||||||
|
public static Collection<VariantContext> sitesOnlyVariantContexts(Collection<VariantContext> vcs) {
|
||||||
|
List<VariantContext> r = new ArrayList<VariantContext>();
|
||||||
|
for ( VariantContext vc : vcs )
|
||||||
|
r.add(sitesOnlyVariantContext(vc));
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
public static VariantContext pruneVariantContext(VariantContext vc) {
|
public static VariantContext pruneVariantContext(VariantContext vc) {
|
||||||
return pruneVariantContext(vc, null);
|
return pruneVariantContext(vc, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static VariantContext pruneVariantContext(VariantContext vc, Set<String> keysToPreserve ) {
|
public static VariantContext pruneVariantContext(VariantContext vc, Collection<String> keysToPreserve ) {
|
||||||
MutableVariantContext mvc = new MutableVariantContext(vc);
|
MutableVariantContext mvc = new MutableVariantContext(vc);
|
||||||
|
|
||||||
if ( keysToPreserve == null || keysToPreserve.size() == 0 )
|
if ( keysToPreserve == null || keysToPreserve.size() == 0 )
|
||||||
|
|
@ -304,9 +333,10 @@ public class VariantContextUtils {
|
||||||
for ( Genotype g : gs ) {
|
for ( Genotype g : gs ) {
|
||||||
MutableGenotype mg = new MutableGenotype(g);
|
MutableGenotype mg = new MutableGenotype(g);
|
||||||
mg.clearAttributes();
|
mg.clearAttributes();
|
||||||
for ( String key : keysToPreserve )
|
if ( keysToPreserve != null )
|
||||||
if ( g.hasAttribute(key) )
|
for ( String key : keysToPreserve )
|
||||||
mg.putAttribute(key, g.getAttribute(key));
|
if ( g.hasAttribute(key) )
|
||||||
|
mg.putAttribute(key, g.getAttribute(key));
|
||||||
mvc.addGenotype(mg);
|
mvc.addGenotype(mg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -536,7 +566,7 @@ public class VariantContextUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
// if at least one record was unfiltered and we want a union, clear all of the filters
|
// if at least one record was unfiltered and we want a union, clear all of the filters
|
||||||
if ( filteredRecordMergeType == filteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
|
if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() )
|
||||||
filters.clear();
|
filters.clear();
|
||||||
|
|
||||||
// we care about where the call came from
|
// we care about where the call came from
|
||||||
|
|
@ -795,7 +825,7 @@ public class VariantContextUtils {
|
||||||
* @return the genomeLoc
|
* @return the genomeLoc
|
||||||
*/
|
*/
|
||||||
public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) {
|
public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) {
|
||||||
return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd(), true);
|
return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
public abstract static class AlleleMergeRule {
|
public abstract static class AlleleMergeRule {
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import com.sun.tools.internal.xjc.reader.gbind.ElementSets;
|
||||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||||
import org.broad.tribble.vcf.*;
|
import org.broad.tribble.vcf.*;
|
||||||
import org.broadinstitute.sting.commandline.Hidden;
|
import org.broadinstitute.sting.commandline.Hidden;
|
||||||
|
|
@ -42,6 +43,7 @@ import org.broadinstitute.sting.commandline.Argument;
|
||||||
import org.broadinstitute.sting.commandline.Output;
|
import org.broadinstitute.sting.commandline.Output;
|
||||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||||
|
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
|
@ -96,6 +98,9 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
|
|
||||||
private List<String> priority = null;
|
private List<String> priority = null;
|
||||||
|
|
||||||
|
/** Optimization to strip out genotypes before merging if we are doing a sites_only output */
|
||||||
|
private boolean sitesOnlyVCF = false;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null);
|
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null);
|
||||||
|
|
||||||
|
|
@ -113,7 +118,13 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
|
||||||
if ( SET_KEY != null )
|
if ( SET_KEY != null )
|
||||||
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
|
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants"));
|
||||||
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
|
vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.<String>emptySet() : samples));
|
||||||
|
|
||||||
|
if ( vcfWriter instanceof VCFWriterStub) {
|
||||||
|
sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes();
|
||||||
|
if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance");
|
||||||
|
} else
|
||||||
|
logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void validateAnnotateUnionArguments() {
|
private void validateAnnotateUnionArguments() {
|
||||||
|
|
@ -142,6 +153,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
// Need to provide reference bases to simpleMerge starting at current locus
|
// Need to provide reference bases to simpleMerge starting at current locus
|
||||||
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, null,context.getLocation(), true, false);
|
Collection<VariantContext> vcs = tracker.getAllVariantContexts(ref, null,context.getLocation(), true, false);
|
||||||
|
|
||||||
|
if ( sitesOnlyVCF ) {
|
||||||
|
vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs);
|
||||||
|
}
|
||||||
|
|
||||||
if ( ASSUME_IDENTICAL_SAMPLES ) {
|
if ( ASSUME_IDENTICAL_SAMPLES ) {
|
||||||
for ( final VariantContext vc : vcs ) {
|
for ( final VariantContext vc : vcs ) {
|
||||||
vcfWriter.add( vc, ref.getBase() );
|
vcfWriter.add( vc, ref.getBase() );
|
||||||
|
|
@ -154,13 +169,12 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
for (VariantContext vc : vcs) {
|
for (VariantContext vc : vcs) {
|
||||||
if (vc.filtersWereApplied() && vc.isFiltered())
|
if (vc.filtersWereApplied() && vc.isFiltered())
|
||||||
numFilteredRecords++;
|
numFilteredRecords++;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
|
if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN))
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
VariantContext mergedVC = null;
|
VariantContext mergedVC;
|
||||||
if ( master ) {
|
if ( master ) {
|
||||||
mergedVC = VariantContextUtils.masterMerge(vcs, "master");
|
mergedVC = VariantContextUtils.masterMerge(vcs, "master");
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -176,7 +190,7 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
|
||||||
VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false);
|
VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false);
|
||||||
VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes);
|
VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes);
|
||||||
if ( minimalVCF )
|
if ( minimalVCF )
|
||||||
annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, new HashSet(Arrays.asList(SET_KEY)));
|
annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY));
|
||||||
vcfWriter.add(annotatedMergedVC, ref.getBase());
|
vcfWriter.add(annotatedMergedVC, ref.getBase());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,46 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2010, The Broad Institute
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.WalkerTest;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
public class BatchMergeIntegrationTest extends WalkerTest {
|
||||||
|
@Test
|
||||||
|
public void testBatchMerge1() {
|
||||||
|
String bam = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
|
||||||
|
String alleles = validationDataLocation + "batch.merge.alleles.vcf";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T UnifiedGenotyper -NO_HEADER -BTI alleles -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference
|
||||||
|
+ " -B:alleles,VCF " + alleles
|
||||||
|
+ " -I " + bam,
|
||||||
|
1,
|
||||||
|
Arrays.asList("db0611522fb691adbd9903d325b879f7"));
|
||||||
|
executeTest("testBatchMerge UG genotype given alleles:" + new File(bam).getName() + " with " + new File(alleles).getName(), spec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -34,7 +34,6 @@ import java.util.Arrays;
|
||||||
* Tests CombineVariants
|
* Tests CombineVariants
|
||||||
*/
|
*/
|
||||||
public class CombineVariantsIntegrationTest extends WalkerTest {
|
public class CombineVariantsIntegrationTest extends WalkerTest {
|
||||||
|
|
||||||
public static String baseTestString(String args) {
|
public static String baseTestString(String args) {
|
||||||
return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args;
|
return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args;
|
||||||
}
|
}
|
||||||
|
|
@ -104,6 +103,24 @@ public class CombineVariantsIntegrationTest extends WalkerTest {
|
||||||
1,
|
1,
|
||||||
Arrays.asList("a07995587b855f3214fb71940bf23c0f"));
|
Arrays.asList("a07995587b855f3214fb71940bf23c0f"));
|
||||||
executeTest("threeWayWithRefs", spec);
|
executeTest("threeWayWithRefs", spec);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// complex examples with filtering, indels, and multiple alleles
|
||||||
|
public void combineComplexSites(String args, String md5) {
|
||||||
|
String file1 = "combine.1.vcf";
|
||||||
|
String file2 = "combine.2.vcf";
|
||||||
|
WalkerTestSpec spec = new WalkerTestSpec(
|
||||||
|
"-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference
|
||||||
|
+ " -B:one,VCF " + validationDataLocation + file1
|
||||||
|
+ " -B:two,VCF " + validationDataLocation + file2 + args,
|
||||||
|
1,
|
||||||
|
Arrays.asList(md5));
|
||||||
|
executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void complexTestFull() { combineComplexSites("", "ec593a12f932af92f0f6e38cf9c4bc60"); }
|
||||||
|
@Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "977e27e407674e8a162f1773310294a5"); }
|
||||||
|
@Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "4bcab3ae334c718f4cf3203fe38d7ce6"); }
|
||||||
|
@Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "c70d0ccdb821921ea5bf7975364ad5a0"); }
|
||||||
}
|
}
|
||||||
Loading…
Reference in New Issue