From cd058dd10f337ab83919eaaba1f525e1f619b9f7 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sat, 24 Sep 2011 13:40:11 -0400 Subject: [PATCH] a) Fixed md5 for legit change in UG output that now also no-calls genotypes w/0,0,0 in PL's in SNP case. b) First reimplementation of new vc merger of different types. Previous version did it in two steps, first merging all vc's per type and then trying to see if resulting vc's would be merged if alleles of one type were a subset of another, but this won't work when uniquifying genotypes since sample names would be messed up and GT sample names wouldn't match VC sample names. Now, it's actually simpler: when splitting vc's by type before merging, we check for alleles of one vc being a subset of alleles of vc of another type and if so we put them together in same list. --- .../walkers/variantutils/CombineVariants.java | 36 ++-------------- .../variantcontext/VariantContextUtils.java | 42 +++++++++++++++++-- .../UnifiedGenotyperIntegrationTest.java | 2 +- 3 files changed, 43 insertions(+), 37 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 40b704570..707f940fe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -234,47 +234,17 @@ public class CombineVariants extends RodWalker { if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - List preMergedVCs = new ArrayList(); + List mergedVCs = new ArrayList(); Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); // iterate over the types so that it's deterministic for ( VariantContext.Type type : VariantContext.Type.values() ) { if ( VCsByType.containsKey(type) ) - preMergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), + mergedVCs.add(VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), VCsByType.get(type), priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } - List mergedVCs = new ArrayList(); - // se have records merged but separated by type. If a particular record is for example a snp but all alleles are a subset of an existing mixed record, - // we will still merge those records. - if (preMergedVCs.size() > 1) { - for (VariantContext vc1 : preMergedVCs) { - VariantContext newvc = vc1; - boolean merged = false; - for (int k=0; k < mergedVCs.size(); k++) { - VariantContext vc2 = mergedVCs.get(k); - - if (VariantContextUtils.allelesAreSubset(vc1,vc2) || VariantContextUtils.allelesAreSubset(vc2,vc1)) { - // all alleles of vc1 are contained in vc2 but they are of different type (say, vc1 is snp, vc2 is complex): try to merget v1 into v2 - List vcpair = new ArrayList(); - vcpair.add(vc1); - vcpair.add(vc2); - newvc = VariantContextUtils.simpleMerge(getToolkit().getGenomeLocParser(), vcpair, - priority, filteredRecordsMergeType, genotypeMergeOption, true, printComplexMerges, - SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC); - mergedVCs.set(k,newvc); - merged = true; - break; - } - } - if (!merged) - mergedVCs.add(vc1); - } - } - else { - mergedVCs = preMergedVCs; - } - for ( VariantContext mergedVC : mergedVCs ) { + for ( VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) continue; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 0ca067cec..f4f03e8f0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -758,9 +758,45 @@ public class VariantContextUtils { public static Map> separateVariantContextsByType(Collection VCs) { HashMap> mappedVCs = new HashMap>(); for ( VariantContext vc : VCs ) { - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(vc); + + // look at previous variant contexts of different type. If: + // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list + // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) + // c) neither: do nothing, just add vc to its own list + boolean addtoOwnList = true; + for (VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vc.getType())) + continue; + + if (!mappedVCs.containsKey(type)) + continue; + + List vcList = mappedVCs.get(type); + for (int k=0; k < vcList.size(); k++) { + VariantContext otherVC = vcList.get(k); + if (allelesAreSubset(otherVC,vc)) { + // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list + vcList.remove(k); + // avoid having empty lists + if (vcList.size() == 0) + mappedVCs.remove(vcList); + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(otherVC); + break; + } + else if (allelesAreSubset(vc,otherVC)) { + // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own + mappedVCs.get(type).add(vc); + addtoOwnList = false; + } + } + } + if (addtoOwnList) { + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(vc); + } } return mappedVCs; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 41496bdf1..488b3ccd9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -42,7 +42,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("ec43daadfb15b00b41aeb0017a45df0b")); + Arrays.asList("6458f3b8fe4954e2ffc2af972aaab19e")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); }