In cases where one uses VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE we used to verify that the samples names are unique in VariantContextUtils.simpleMerge for each VCs. It couse to a bug that was reported on the forum (when a VCs had 2 VC from the same sample).
Now we will check it only in CombineVariants.init using the headers. A new function was added to SamplesUtils with unitTests in CVunitTest.java.
This commit is contained in:
parent
fc22a5c71c
commit
b4447cdca2
|
|
@ -48,14 +48,14 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
|
|||
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.variant.vcf.*;
|
||||
import org.testng.Assert;
|
||||
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.StringBufferInputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Set;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* test out pieces of the combine variants code
|
||||
|
|
@ -76,6 +76,33 @@ public class CombineVariantsUnitTest {
|
|||
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
|
||||
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
|
||||
|
||||
public static String VCF4headerStringsWithSamplesName =
|
||||
"##fileformat=VCFv4.0\n" +
|
||||
"##filedate=2010-06-21\n"+
|
||||
"##reference=NCBI36\n"+
|
||||
"##INFO=<ID=GC, Number=0, Type=Flag, Description=\"Overlap with Gencode CCDS coding sequence\">\n"+
|
||||
"##INFO=<ID=DP, Number=1, Type=Integer, Description=\"Total number of reads in haplotype window\">\n"+
|
||||
"##INFO=<ID=AF, Number=1, Type=Float, Description=\"Dindel estimated population allele frequency\">\n"+
|
||||
"##FILTER=<ID=NoQCALL, Description=\"Variant called by Dindel but not confirmed by QCALL\">\n"+
|
||||
"##FORMAT=<ID=GT, Number=1, Type=String, Description=\"Genotype\">\n"+
|
||||
"##FORMAT=<ID=HQ, Number=2, Type=Integer, Description=\"Haplotype quality\">\n"+
|
||||
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
|
||||
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12878\tNA12891\n";
|
||||
|
||||
public static String VCF4headerStringsWithUniqueSamplesName =
|
||||
"##fileformat=VCFv4.0\n" +
|
||||
"##filedate=2010-06-21\n"+
|
||||
"##reference=NCBI36\n"+
|
||||
"##INFO=<ID=GC, Number=0, Type=Flag, Description=\"Overlap with Gencode CCDS coding sequence\">\n"+
|
||||
"##INFO=<ID=DP, Number=1, Type=Integer, Description=\"Total number of reads in haplotype window\">\n"+
|
||||
"##INFO=<ID=AF, Number=1, Type=Float, Description=\"Dindel estimated population allele frequency\">\n"+
|
||||
"##FILTER=<ID=NoQCALL, Description=\"Variant called by Dindel but not confirmed by QCALL\">\n"+
|
||||
"##FORMAT=<ID=GT, Number=1, Type=String, Description=\"Genotype\">\n"+
|
||||
"##FORMAT=<ID=HQ, Number=2, Type=Integer, Description=\"Haplotype quality\">\n"+
|
||||
"##FORMAT=<ID=GQ, Number=1, Type=Integer, Description=\"Genotype quality\">\n"+
|
||||
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tNA12892\n";
|
||||
|
||||
|
||||
// altered info field
|
||||
public static String VCF4headerStringsBrokenInfo =
|
||||
"##fileformat=VCFv4.0\n"+
|
||||
|
|
@ -110,6 +137,26 @@ public class CombineVariantsUnitTest {
|
|||
return head;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeadersWithSamplesNamesDuplicationThatIsNotAllowed() {
|
||||
VCFHeader one = createHeader(VCF4headerStringsWithSamplesName);
|
||||
VCFHeader two = createHeader(VCF4headerStringsWithSamplesName);
|
||||
Map<String, VCFHeader> headers = new HashMap<String, VCFHeader>();
|
||||
headers.put("VCF4headerStringsWithSamplesName",one);
|
||||
headers.put("VCF4headerStringsWithSamplesName2",two);
|
||||
Assert.assertEquals(SampleUtils.verifyUniqueSamplesNames(headers),false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeadersWithoutSamplesNamesDuplication() {
|
||||
VCFHeader one = createHeader(VCF4headerStringsWithSamplesName);
|
||||
VCFHeader two = createHeader(VCF4headerStringsWithUniqueSamplesName);
|
||||
Map<String, VCFHeader> headers = new HashMap<String, VCFHeader>();
|
||||
headers.put("VCF4headerStringsWithSamplesName",one);
|
||||
headers.put("VCF4headerStringsWithSamplesName2",two);
|
||||
Assert.assertEquals(SampleUtils.verifyUniqueSamplesNames(headers),true);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHeadersWhereOneIsAStrictSubsetOfTheOther() {
|
||||
VCFHeader one = createHeader(VCFHeaderUnitTest.VCF4headerStrings);
|
||||
|
|
|
|||
|
|
@ -208,6 +208,10 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
|
|||
logger.info("Priority string is not provided, using arbitrary genotyping order: "+priority);
|
||||
}
|
||||
|
||||
if (genotypeMergeOption == VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE &&
|
||||
!SampleUtils.verifyUniqueSamplesNames(vcfRods))
|
||||
throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered.");
|
||||
|
||||
samples = sitesOnlyVCF ? Collections.<String>emptySet() : SampleUtils.getSampleList(vcfRods, genotypeMergeOption);
|
||||
|
||||
if ( SET_KEY.toLowerCase().equals("null") )
|
||||
|
|
@ -223,6 +227,8 @@ public class CombineVariants extends RodWalker<Integer, Integer> implements Tree
|
|||
vcfWriter.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private void validateAnnotateUnionArguments() {
|
||||
Set<String> rodNames = SampleUtils.getRodNamesWithVCFHeader(getToolkit(), null);
|
||||
|
||||
|
|
|
|||
|
|
@ -132,6 +132,25 @@ public class SampleUtils {
|
|||
return samples;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @param VCF_Headers
|
||||
* @return false if there are names duplication between the samples names in the VCF headers
|
||||
*/
|
||||
public static boolean verifyUniqueSamplesNames(Map<String, VCFHeader> VCF_Headers) {
|
||||
Set<String> samples = new HashSet<String>();
|
||||
for ( Map.Entry<String, VCFHeader> val : VCF_Headers.entrySet() ) {
|
||||
VCFHeader header = val.getValue();
|
||||
for ( String sample : header.getGenotypeSamples() ) {
|
||||
if (samples.add(sample))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the sample names from all VCF rods input by the user and uniquifies them if there is overlap
|
||||
* (e.g. sampleX.1, sampleX.2, ...)
|
||||
|
|
|
|||
|
|
@ -450,7 +450,9 @@ public class VariantContextUtils {
|
|||
/**
|
||||
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
|
||||
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
|
||||
* the sample name
|
||||
* the sample name.
|
||||
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
|
||||
* SampleUtils.verifyUniqueSamplesNames to check that before using sempleMerge.
|
||||
*
|
||||
* @param unsortedVCs collection of unsorted VCs
|
||||
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
|
||||
|
|
@ -483,9 +485,6 @@ public class VariantContextUtils {
|
|||
if ( annotateOrigin && priorityListOfVCs == null && originalNumOfVCs == 0)
|
||||
throw new IllegalArgumentException("Cannot merge calls and annotate their origins without a complete priority list of VariantContexts or the number of original VariantContexts");
|
||||
|
||||
if ( genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE )
|
||||
verifyUniqueSampleNames(unsortedVCs);
|
||||
|
||||
final List<VariantContext> preFilteredVCs = sortVariantContextsByPriority(unsortedVCs, priorityListOfVCs, genotypeMergeOptions);
|
||||
// Make sure all variant contexts are padded with reference base in case of indels if necessary
|
||||
final List<VariantContext> VCs = new ArrayList<VariantContext>();
|
||||
|
|
@ -774,18 +773,19 @@ public class VariantContextUtils {
|
|||
}
|
||||
}
|
||||
|
||||
static private void verifyUniqueSampleNames(Collection<VariantContext> unsortedVCs) {
|
||||
Set<String> names = new HashSet<String>();
|
||||
for ( VariantContext vc : unsortedVCs ) {
|
||||
for ( String name : vc.getSampleNames() ) {
|
||||
//System.out.printf("Checking %s %b%n", name, names.contains(name));
|
||||
if ( names.contains(name) )
|
||||
throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name);
|
||||
}
|
||||
|
||||
names.addAll(vc.getSampleNames());
|
||||
}
|
||||
}
|
||||
// TODO: remove that after testing
|
||||
// static private void verifyUniqueSampleNames(Collection<VariantContext> unsortedVCs) {
|
||||
// Set<String> names = new HashSet<String>();
|
||||
// for ( VariantContext vc : unsortedVCs ) {
|
||||
// for ( String name : vc.getSampleNames() ) {
|
||||
// //System.out.printf("Checking %s %b%n", name, names.contains(name));
|
||||
// if ( names.contains(name) )
|
||||
// throw new IllegalStateException("REQUIRE_UNIQUE sample names is true but duplicate names were discovered " + name);
|
||||
// }
|
||||
//
|
||||
// names.addAll(vc.getSampleNames());
|
||||
// }
|
||||
// }
|
||||
|
||||
|
||||
static private Allele determineReferenceAllele(List<VariantContext> VCs) {
|
||||
|
|
|
|||
|
|
@ -542,15 +542,16 @@ public class VariantContextUtilsUnitTest extends BaseTest {
|
|||
Assert.assertEquals(merged.getSampleNames(), new HashSet<String>(Arrays.asList("s1.1", "s1.2")));
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = IllegalStateException.class)
|
||||
public void testMergeGenotypesRequireUnique() {
|
||||
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
|
||||
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
|
||||
|
||||
final VariantContext merged = VariantContextUtils.simpleMerge(
|
||||
Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
||||
VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false);
|
||||
}
|
||||
// TODO: remove after testing
|
||||
// @Test(expectedExceptions = IllegalStateException.class)
|
||||
// public void testMergeGenotypesRequireUnique() {
|
||||
// final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
|
||||
// final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
|
||||
//
|
||||
// final VariantContext merged = VariantContextUtils.simpleMerge(
|
||||
// Arrays.asList(vc1, vc2), null, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
|
||||
// VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE, false, false, "set", false, false);
|
||||
// }
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
|||
Loading…
Reference in New Issue