From b4447cdca2fb6c791f5fbea93e898e96a8b5d0d2 Mon Sep 17 00:00:00 2001 From: Ami Levy-Moonshine Date: Fri, 25 Jan 2013 15:49:51 -0500 Subject: [PATCH] In cases where one uses VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE we used to verify that the samples names are unique in VariantContextUtils.simpleMerge for each VCs. It couse to a bug that was reported on the forum (when a VCs had 2 VC from the same sample). Now we will check it only in CombineVariants.init using the headers. A new function was added to SamplesUtils with unitTests in CVunitTest.java. --- .../variantutils/CombineVariantsUnitTest.java | 51 ++++++++++++++++++- .../walkers/variantutils/CombineVariants.java | 6 +++ .../sting/utils/SampleUtils.java | 19 +++++++ .../variantcontext/VariantContextUtils.java | 32 ++++++------ .../VariantContextUtilsUnitTest.java | 19 +++---- 5 files changed, 100 insertions(+), 27 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java index 1fb056c78..46bd5af3c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java @@ -48,14 +48,14 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.PositionalBufferedStream; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.variant.vcf.*; import org.testng.Assert; import org.testng.annotations.Test; import java.io.StringBufferInputStream; -import java.util.ArrayList; -import java.util.Set; +import java.util.*; /** * test out pieces of the combine variants code @@ -76,6 +76,33 @@ public class CombineVariantsUnitTest { "##FORMAT=\n"+ "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"; + public static String VCF4headerStringsWithSamplesName = + "##fileformat=VCFv4.0\n" + + "##filedate=2010-06-21\n"+ + "##reference=NCBI36\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##FILTER=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n"+ + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA12891\n"; + + public static String VCF4headerStringsWithUniqueSamplesName = + "##fileformat=VCFv4.0\n" + + "##filedate=2010-06-21\n"+ + "##reference=NCBI36\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##INFO=\n"+ + "##FILTER=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n"+ + "##FORMAT=\n"+ + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12892\n"; + + // altered info field public static String VCF4headerStringsBrokenInfo = "##fileformat=VCFv4.0\n"+ @@ -110,6 +137,26 @@ public class CombineVariantsUnitTest { return head; } + @Test + public void testHeadersWithSamplesNamesDuplicationThatIsNotAllowed() { + VCFHeader one = createHeader(VCF4headerStringsWithSamplesName); + VCFHeader two = createHeader(VCF4headerStringsWithSamplesName); + Map headers = new HashMap(); + headers.put("VCF4headerStringsWithSamplesName",one); + headers.put("VCF4headerStringsWithSamplesName2",two); + Assert.assertEquals(SampleUtils.verifyUniqueSamplesNames(headers),false); + } + + @Test + public void testHeadersWithoutSamplesNamesDuplication() { + VCFHeader one = createHeader(VCF4headerStringsWithSamplesName); + VCFHeader two = createHeader(VCF4headerStringsWithUniqueSamplesName); + Map headers = new HashMap(); + headers.put("VCF4headerStringsWithSamplesName",one); + headers.put("VCF4headerStringsWithSamplesName2",two); + Assert.assertEquals(SampleUtils.verifyUniqueSamplesNames(headers),true); + } + @Test public void testHeadersWhereOneIsAStrictSubsetOfTheOther() { VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 996eab7e9..3ebac3e58 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -208,6 +208,10 @@ public class CombineVariants extends RodWalker implements Tree logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority); } + if (genotypeMergeOption == VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE && + !SampleUtils.verifyUniqueSamplesNames(vcfRods)) + throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered."); + samples = sitesOnlyVCF ? Collections.emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption); if ( SET_KEY.toLowerCase().equals("null") ) @@ -223,6 +227,8 @@ public class CombineVariants extends RodWalker implements Tree vcfWriter.writeHeader(vcfHeader); } + + private void validateAnnotateUnionArguments() { Set rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null); diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java index 94e073ca3..6a430588b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java @@ -132,6 +132,25 @@ public class SampleUtils { return samples; } + + /** + * + * @param VCF_Headers + * @return false if there are names duplication between the samples names in the VCF headers + */ + public static boolean verifyUniqueSamplesNames(Map VCF_Headers) { + Set samples = new HashSet(); + for ( Map.Entry val : VCF_Headers.entrySet() ) { + VCFHeader header = val.getValue(); + for ( String sample : header.getGenotypeSamples() ) { + if (samples.add(sample)) + return false; + } + } + + return true; + } + /** * Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap * (e.g. sampleX.1, sampleX.2, ...) diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java index 0a848fba1..b2f41d43b 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java @@ -450,7 +450,9 @@ public class VariantContextUtils { /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name + * the sample name. + * simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use + * SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge. * * @param unsortedVCs collection of unsorted VCs * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs @@ -483,9 +485,6 @@ public class VariantContextUtils { if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0) throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts"); - if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE ) - verifyUniqueSampleNames(unsortedVCs); - final List preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions); // Make sure all variant contexts are padded with reference base in case of indels if necessary final List VCs = new ArrayList(); @@ -774,18 +773,19 @@ public class VariantContextUtils { } } - static private void verifyUniqueSampleNames(Collection unsortedVCs) { - Set names = new HashSet(); - for ( VariantContext vc : unsortedVCs ) { - for ( String name : vc.getSampleNames() ) { - //System.out.printf("Checking %s %b%n", name, names.contains(name)); - if ( names.contains(name) ) - throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name); - } - - names.addAll(vc.getSampleNames()); - } - } +// TODO: remove that after testing +// static private void verifyUniqueSampleNames(Collection unsortedVCs) { +// Set names = new HashSet(); +// for ( VariantContext vc : unsortedVCs ) { +// for ( String name : vc.getSampleNames() ) { +// //System.out.printf("Checking %s %b%n", name, names.contains(name)); +// if ( names.contains(name) ) +// throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name); +// } +// +// names.addAll(vc.getSampleNames()); +// } +// } static private Allele determineReferenceAllele(List VCs) { diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUtilsUnitTest.java index 9ddb6957c..9c1486cf7 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/VariantContextUtilsUnitTest.java @@ -542,15 +542,16 @@ public class VariantContextUtilsUnitTest extends BaseTest { Assert.assertEquals(merged.getSampleNames(), new HashSet(Arrays.asList("s1.1", "s1.2"))); } - @Test(expectedExceptions = IllegalStateException.class) - public void testMergeGenotypesRequireUnique() { - final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); - final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); - - final VariantContext merged = VariantContextUtils.simpleMerge( - Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, - VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false); - } +// TODO: remove after testing +// @Test(expectedExceptions = IllegalStateException.class) +// public void testMergeGenotypesRequireUnique() { +// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1)); +// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2)); +// +// final VariantContext merged = VariantContextUtils.simpleMerge( +// Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, +// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false); +// } // -------------------------------------------------------------------------------- //