diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index fe6464c49..60c17d17c 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.contexts.variantcontext; import java.io.Serializable; import java.util.*; +import com.google.java.contract.*; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.util.StringUtil; import org.apache.commons.jexl2.*; @@ -282,11 +283,39 @@ public class VariantContextUtils { return HardyWeinbergCalculation.hwCalculate(vc.getHomRefCount(), vc.getHetCount(), vc.getHomVarCount()); } + /** + * Returns a newly allocated VC that is the same as VC, but without genotypes + * @param vc + * @return + */ + @Requires("vc != null") + @Ensures("result != null") + public static VariantContext sitesOnlyVariantContext(VariantContext vc) { + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), + vc.getAlleles(), vc.getNegLog10PError(), + vc.filtersWereApplied() ? vc.getFilters() : null, + vc.getAttributes()); + } + + /** + * Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes + * @param vcs + * @return + */ + @Requires("vcs != null") + @Ensures("result != null") + public static Collection sitesOnlyVariantContexts(Collection vcs) { + List r = new ArrayList(); + for ( VariantContext vc : vcs ) + r.add(sitesOnlyVariantContext(vc)); + return r; + } + public static VariantContext pruneVariantContext(VariantContext vc) { return pruneVariantContext(vc, null); } - public static VariantContext pruneVariantContext(VariantContext vc, Set keysToPreserve ) { + public static VariantContext pruneVariantContext(VariantContext vc, Collection keysToPreserve ) { MutableVariantContext mvc = new MutableVariantContext(vc); if ( keysToPreserve == null || keysToPreserve.size() == 0 ) @@ -304,9 +333,10 @@ public class VariantContextUtils { for ( Genotype g : gs ) { MutableGenotype mg = new MutableGenotype(g); mg.clearAttributes(); - for ( String key : keysToPreserve ) - if ( g.hasAttribute(key) ) - mg.putAttribute(key, g.getAttribute(key)); + if ( keysToPreserve != null ) + for ( String key : keysToPreserve ) + if ( g.hasAttribute(key) ) + mg.putAttribute(key, g.getAttribute(key)); mvc.addGenotype(mg); } @@ -536,7 +566,7 @@ public class VariantContextUtils { } // if at least one record was unfiltered and we want a union, clear all of the filters - if ( filteredRecordMergeType == filteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) + if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) filters.clear(); // we care about where the call came from @@ -795,7 +825,7 @@ public class VariantContextUtils { * @return the genomeLoc */ public static final GenomeLoc getLocation(GenomeLocParser genomeLocParser,VariantContext vc) { - return genomeLocParser.createGenomeLoc(vc.getChr(),(int)vc.getStart(),(int)vc.getEnd(), true); + return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); } public abstract static class AlleleMergeRule { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index e905f647e..4cee70def 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; +import com.sun.tools.internal.xjc.reader.gbind.ElementSets; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.commandline.Hidden; @@ -42,6 +43,7 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.vcf.VCFUtils; +import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; import java.util.*; @@ -96,6 +98,9 @@ public class CombineVariants extends RodWalker { private List priority = null; + /** Optimization to strip out genotypes before merging if we are doing a sites_only output */ + private boolean sitesOnlyVCF = false; + public void initialize() { Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null); @@ -113,7 +118,13 @@ public class CombineVariants extends RodWalker { Set headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger); if ( SET_KEY != null ) headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record in CombineVariants")); - vcfWriter.writeHeader(new VCFHeader(headerLines, samples)); + vcfWriter.writeHeader(new VCFHeader(headerLines, sitesOnlyVCF ? Collections.emptySet() : samples)); + + if ( vcfWriter instanceof VCFWriterStub) { + sitesOnlyVCF = ((VCFWriterStub)vcfWriter).doNotWriteGenotypes(); + if ( sitesOnlyVCF ) logger.info("Pre-stripping genotypes for performance"); + } else + logger.warn("VCF output file not an instance of VCFWriterStub; cannot enable sites only output option"); } private void validateAnnotateUnionArguments() { @@ -142,6 +153,10 @@ public class CombineVariants extends RodWalker { // Need to provide reference bases to simpleMerge starting at current locus Collection vcs = tracker.getAllVariantContexts(ref, null,context.getLocation(), true, false); + if ( sitesOnlyVCF ) { + vcs = VariantContextUtils.sitesOnlyVariantContexts(vcs); + } + if ( ASSUME_IDENTICAL_SAMPLES ) { for ( final VariantContext vc : vcs ) { vcfWriter.add( vc, ref.getBase() ); @@ -154,13 +169,12 @@ public class CombineVariants extends RodWalker { for (VariantContext vc : vcs) { if (vc.filtersWereApplied() && vc.isFiltered()) numFilteredRecords++; - } if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - VariantContext mergedVC = null; + VariantContext mergedVC; if ( master ) { mergedVC = VariantContextUtils.masterMerge(vcs, "master"); } else { @@ -176,7 +190,7 @@ public class CombineVariants extends RodWalker { VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false); VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); if ( minimalVCF ) - annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, new HashSet(Arrays.asList(SET_KEY))); + annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, Arrays.asList(SET_KEY)); vcfWriter.add(annotatedMergedVC, ref.getBase()); } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java new file mode 100755 index 000000000..d08d2c75c --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/BatchMergeIntegrationTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; + +public class BatchMergeIntegrationTest extends WalkerTest { + @Test + public void testBatchMerge1() { + String bam = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; + String alleles = validationDataLocation + "batch.merge.alleles.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T UnifiedGenotyper -NO_HEADER -BTI alleles -glm BOTH -G none -nsl -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -o %s -R " + b37KGReference + + " -B:alleles,VCF " + alleles + + " -I " + bam, + 1, + Arrays.asList("db0611522fb691adbd9903d325b879f7")); + executeTest("testBatchMerge UG genotype given alleles:" + new File(bam).getName() + " with " + new File(alleles).getName(), spec); + } +} \ No newline at end of file diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 33c595f8c..851e1753b 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -34,7 +34,6 @@ import java.util.Arrays; * Tests CombineVariants */ public class CombineVariantsIntegrationTest extends WalkerTest { - public static String baseTestString(String args) { return "-T CombineVariants -NO_HEADER -L 1:1-50,000,000 -o %s -R " + b36KGReference + args; } @@ -104,6 +103,24 @@ public class CombineVariantsIntegrationTest extends WalkerTest { 1, Arrays.asList("a07995587b855f3214fb71940bf23c0f")); executeTest("threeWayWithRefs", spec); - } -} + + + // complex examples with filtering, indels, and multiple alleles + public void combineComplexSites(String args, String md5) { + String file1 = "combine.1.vcf"; + String file2 = "combine.2.vcf"; + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants -NO_HEADER -o %s -R " + b37KGReference + + " -B:one,VCF " + validationDataLocation + file1 + + " -B:two,VCF " + validationDataLocation + file2 + args, + 1, + Arrays.asList(md5)); + executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); + } + + @Test public void complexTestFull() { combineComplexSites("", "ec593a12f932af92f0f6e38cf9c4bc60"); } + @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "977e27e407674e8a162f1773310294a5"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "4bcab3ae334c718f4cf3203fe38d7ce6"); } + @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "c70d0ccdb821921ea5bf7975364ad5a0"); } +} \ No newline at end of file