In cases where one uses VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE we used to verify that the samples names are unique in VariantContextUtils.simpleMerge for each VCs. It couse to a bug that was reported on the forum (when a VCs had 2 VC from the same sample).

Now we will check it only in CombineVariants.init using the headers. A new function was added to SamplesUtils with unitTests in CVunitTest.java.
This commit is contained in:
Ami Levy-Moonshine 2013-01-25 15:49:51 -05:00
parent fc22a5c71c
commit b4447cdca2
5 changed files with 100 additions and 27 deletions

View File

@ -48,14 +48,14 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.variant.vcf.*;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.io.StringBufferInputStream;
import java.util.ArrayList;
import java.util.Set;
import java.util.*;
/**
* test out pieces of the combine variants code
@ -76,6 +76,33 @@ public class CombineVariantsUnitTest {
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
public static String VCF4headerStringsWithSamplesName =
"##fileformat=VCFv4.0\n" +
"##filedate=2010-06-21\n"+
"##reference=NCBI36\n"+
"##INFO=<ID=GC, Number=0, Type=Flag, Description=\"Overlap with Gencode CCDS coding sequence\">\n"+
"##INFO=<ID=DP, Number=1, Type=Integer, Description=\"Total number of reads in haplotype window\">\n"+
"##INFO=<ID=AF, Number=1, Type=Float, Description=\"Dindel estimated population allele frequency\">\n"+
"##FILTER=<ID=NoQCALL, Description=\"Variant called by Dindel but not confirmed by QCALL\">\n"+
"##FORMAT=<ID=GT, Number=1, Type=String, Description=\"Genotype\">\n"+
"##FORMAT=<ID=HQ, Number=2, Type=Integer, Description=\"Haplotype quality\">\n"+
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA12891\n";
public static String VCF4headerStringsWithUniqueSamplesName =
"##fileformat=VCFv4.0\n" +
"##filedate=2010-06-21\n"+
"##reference=NCBI36\n"+
"##INFO=<ID=GC, Number=0, Type=Flag, Description=\"Overlap with Gencode CCDS coding sequence\">\n"+
"##INFO=<ID=DP, Number=1, Type=Integer, Description=\"Total number of reads in haplotype window\">\n"+
"##INFO=<ID=AF, Number=1, Type=Float, Description=\"Dindel estimated population allele frequency\">\n"+
"##FILTER=<ID=NoQCALL, Description=\"Variant called by Dindel but not confirmed by QCALL\">\n"+
"##FORMAT=<ID=GT, Number=1, Type=String, Description=\"Genotype\">\n"+
"##FORMAT=<ID=HQ, Number=2, Type=Integer, Description=\"Haplotype quality\">\n"+
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12892\n";
// altered info field
public static String VCF4headerStringsBrokenInfo =
"##fileformat=VCFv4.0\n"+
@ -110,6 +137,26 @@ public class CombineVariantsUnitTest {
return head;
}
@Test
public void testHeadersWithSamplesNamesDuplicationThatIsNotAllowed() {
VCFHeader one = createHeader(VCF4headerStringsWithSamplesName);
VCFHeader two = createHeader(VCF4headerStringsWithSamplesName);
Map<String, VCFHeader> headers = new HashMap<String, VCFHeader>();
headers.put("VCF4headerStringsWithSamplesName",one);
headers.put("VCF4headerStringsWithSamplesName2",two);
Assert.assertEquals(SampleUtils.verifyUniqueSamplesNames(headers),false);
}
@Test
public void testHeadersWithoutSamplesNamesDuplication() {
VCFHeader one = createHeader(VCF4headerStringsWithSamplesName);
VCFHeader two = createHeader(VCF4headerStringsWithUniqueSamplesName);
Map<String, VCFHeader> headers = new HashMap<String, VCFHeader>();
headers.put("VCF4headerStringsWithSamplesName",one);
headers.put("VCF4headerStringsWithSamplesName2",two);
Assert.assertEquals(SampleUtils.verifyUniqueSamplesNames(headers),true);
}
@Test
public void testHeadersWhereOneIsAStrictSubsetOfTheOther() {
VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings);

View File

@ -208,6 +208,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority);
}
if (genotypeMergeOption == VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE &&
!SampleUtils.verifyUniqueSamplesNames(vcfRods))
throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered.");
samples = sitesOnlyVCF ? Collections.<String>emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
if ( SET_KEY.toLowerCase().equals("null") )
@ -223,6 +227,8 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
vcfWriter.writeHeader(vcfHeader);
}
private void validateAnnotateUnionArguments() {
Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);

View File

@ -132,6 +132,25 @@ public class SampleUtils {
return samples;
}
/**
*
* @param VCF_Headers
* @return false if there are names duplication between the samples names in the VCF headers
*/
public static boolean verifyUniqueSamplesNames(Map<String, VCFHeader> VCF_Headers) {
Set<String> samples = new HashSet<String>();
for ( Map.Entry<String, VCFHeader> val : VCF_Headers.entrySet() ) {
VCFHeader header = val.getValue();
for ( String sample : header.getGenotypeSamples() ) {
if (samples.add(sample))
return false;
}
}
return true;
}
/**
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
* (e.g. sampleX.1, sampleX.2, ...)

View File

@ -450,7 +450,9 @@ public class VariantContextUtils {
/**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
* the sample name
* the sample name.
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
* SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge.
*
* @param unsortedVCs collection of unsorted VCs
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
@ -483,9 +485,6 @@ public class VariantContextUtils {
if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0)
throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts");
if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE )
verifyUniqueSampleNames(unsortedVCs);
final List<VariantContext> preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
// Make sure all variant contexts are padded with reference base in case of indels if necessary
final List<VariantContext> VCs = new ArrayList<VariantContext>();
@ -774,18 +773,19 @@ public class VariantContextUtils {
}
}
static private void verifyUniqueSampleNames(Collection<VariantContext> unsortedVCs) {
Set<String> names = new HashSet<String>();
for ( VariantContext vc : unsortedVCs ) {
for ( String name : vc.getSampleNames() ) {
//System.out.printf("Checking %s %b%n", name, names.contains(name));
if ( names.contains(name) )
throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name);
}
names.addAll(vc.getSampleNames());
}
}
// TODO: remove that after testing
// static private void verifyUniqueSampleNames(Collection<VariantContext> unsortedVCs) {
// Set<String> names = new HashSet<String>();
// for ( VariantContext vc : unsortedVCs ) {
// for ( String name : vc.getSampleNames() ) {
// //System.out.printf("Checking %s %b%n", name, names.contains(name));
// if ( names.contains(name) )
// throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name);
// }
//
// names.addAll(vc.getSampleNames());
// }
// }
static private Allele determineReferenceAllele(List<VariantContext> VCs) {

View File

@ -542,15 +542,16 @@ public class VariantContextUtilsUnitTest extends BaseTest {
Assert.assertEquals(merged.getSampleNames(), new HashSet<String>(Arrays.asList("s1.1", "s1.2")));
}
@Test(expectedExceptions = IllegalStateException.class)
public void testMergeGenotypesRequireUnique() {
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
final VariantContext merged = VariantContextUtils.simpleMerge(
Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false);
}
// TODO: remove after testing
// @Test(expectedExceptions = IllegalStateException.class)
// public void testMergeGenotypesRequireUnique() {
// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
//
// final VariantContext merged = VariantContextUtils.simpleMerge(
// Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false);
// }
// --------------------------------------------------------------------------------
//