From 6934f83cc748344400c33e5f87d1c8d4a593c74b Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 25 Nov 2010 04:52:12 +0000 Subject: [PATCH] Two changes to CombineVariants. 1. Fix: VCs were padded before the merge, but they were never unpadded afterwards. This leaves us with a VC that doesn't meet our spec. 2. Update: instead of running the merged VC through every standard annotation (which seems really wrong, since this isn't the annotator tool), just update the chromosome count annotations (AC,AF,AN) through VCUtils. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4734 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContextUtils.java | 68 ++----------------- .../walkers/variantutils/CombineVariants.java | 14 ++-- .../CombineVariantsIntegrationTest.java | 22 +++--- 3 files changed, 19 insertions(+), 85 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 7fe985823..e0121cb54 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -33,6 +33,7 @@ import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broad.tribble.util.variantcontext.*; import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; import org.broadinstitute.sting.utils.*; +import org.broad.tribble.vcf.VCFCodec; import org.broad.tribble.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -467,6 +468,9 @@ public class VariantContextUtils { attributes.put(VariantContext.ID_KEY, rsID); VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, attributes); + // Trim the padded bases of all alleles if necessary + merged = VCFCodec.createVariantContextWithTrimmedAlleles(merged); + if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged); return merged; } @@ -564,70 +568,6 @@ public class VariantContextUtils { } } - - public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { - // see if we need to trim common reference base from all alleles - boolean trimVC = true; - - // We need to trim common reference base from all alleles if a ref base is common to all alleles - Allele refAllele = inputVC.getReference(); - if (!inputVC.isVariant()) - trimVC = false; - else if (refAllele.isNull()) - trimVC = false; - else { - for (Allele a : inputVC.getAlternateAlleles()) { - if (a.length() < 1 || (a.getBases()[0] != refAllele.getBases()[0])) - trimVC = false; - } - } - - // nothing to do if we don't need to trim bases - if (trimVC) { - List alleles = new ArrayList(); - Map genotypes = new TreeMap(); - - Map inputGenotypes = inputVC.getGenotypes(); - // set the reference base for indels in the attributes - Map attributes = new TreeMap(); - - for ( Map.Entry p : inputVC.getAttributes().entrySet() ) { - attributes.put(p.getKey(), p.getValue()); - } - - attributes.put(VariantContext.REFERENCE_BASE_FOR_INDEL_KEY, new Byte(inputVC.getReference().getBases()[0])); - - - for (Allele a : inputVC.getAlleles()) { - // get bases for current allele and create a new one with trimmed bases - byte[] newBases = Arrays.copyOfRange(a.getBases(),1,a.length()); - alleles.add(Allele.create(newBases,a.isReference())); - } - - // now we can recreate new genotypes with trimmed alleles - for (String sample : inputVC.getSampleNames()) { - Genotype g = inputGenotypes.get(sample); - - List inAlleles = g.getAlleles(); - List newGenotypeAlleles = new ArrayList(); - for (Allele a : inAlleles) { - byte[] newBases = Arrays.copyOfRange(a.getBases(),1,a.length()); - newGenotypeAlleles.add(Allele.create(newBases, a.isReference())); - } - genotypes.put(sample, new Genotype(sample, newGenotypeAlleles, g.getNegLog10PError(), - g.getFilters(),g.getAttributes(),g.genotypesArePhased())); - - } - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), - inputVC.getFilters(), attributes); - - } - else - return inputVC; - - } - - static class CompareByPriority implements Comparator, Serializable { List priorityListOfVCs; public CompareByPriority(List priorityListOfVCs) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index f9af5b6b8..048d820c2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -27,8 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.vcf.*; -import org.broadinstitute.sting.commandline.CommandLineUtils; -import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; @@ -37,7 +35,6 @@ import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; @@ -83,18 +80,12 @@ public class CombineVariants extends RodWalker { private List priority = null; - private VariantAnnotatorEngine engine; - public void initialize() { validateAnnotateUnionArguments(); Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), null); Set samples = SampleUtils.getSampleList(vcfRods, genotypeMergeOption); - ArrayList annotationClassesToUse = new ArrayList(); - annotationClassesToUse.add("Standard"); - engine = new VariantAnnotatorEngine(getToolkit(), annotationClassesToUse, new ArrayList()); - if ( SET_KEY.toLowerCase().equals("null") ) SET_KEY = null; @@ -136,7 +127,10 @@ public class CombineVariants extends RodWalker { //out.printf(" merged => %s%nannotated => %s%n", mergedVC, annotatedMergedVC); if ( mergedVC != null ) { // only operate at the start of events - VariantContext annotatedMergedVC = engine.annotateContext(tracker, ref, mergedVC); + HashMap attributes = new HashMap(mergedVC.getAttributes()); + // re-compute chromosome counts + VariantContextUtils.calculateChromosomeCounts(mergedVC, attributes, false); + VariantContext annotatedMergedVC = VariantContext.modifyAttributes(mergedVC, attributes); if ( minimalVCF ) annotatedMergedVC = VariantContextUtils.pruneVariantContext(annotatedMergedVC, new HashSet(Arrays.asList(SET_KEY))); vcfWriter.add(annotatedMergedVC, ref.getBase()); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 0f0864a62..3ab4cbae1 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -60,21 +60,21 @@ public class CombineVariantsIntegrationTest extends WalkerTest { } - @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "3287ddd7c5003b3c791048cb2532578a"); } - @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "1b8b58c2b231926f0ba45d29c5242df5", " -setKey foo"); } - @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "5aaa695bc1754d6abcbed27f0c0b4c64", " -setKey null"); } - @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "79e7e474b0a2bb82930201f143328d5d"); } // official project VCF files in tabix format + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "b9a1887dc8ff20a63a2bae07ea1131f4"); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "1dc8fedba840de1335d23300446c1c07", " -setKey foo"); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "33afd1b97340dfa192e5f886fcadd76f", " -setKey null"); } + @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "99796c731462d5a60dd2789a8074fbc0"); } // official project VCF files in tabix format - @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "d93ee8b9f36eedc80a3afa548bffc888"); } - @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "26c99206407fb2c42beadcac5e5de246"); } + @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "79566b450a71b295aef0285c73f36d6c"); } + @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "0b08098d0519b26e58137c8b337a5119"); } - @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "9b44bb39702b41240371815320e452d5"); } // official project VCF files in tabix format + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "f1749a11f05b383a8df52c98670b2d9a"); } // official project VCF files in tabix format @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "460bdbeaf7e9641395ac2ce6e1afc106"); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "02f0634d0176138e4195009eff7f2308"); } + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "67001f987e8d4aab237c506d6813970e"); } - @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "d443358367211b96762ae64d1461b587"); } + @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "96e473ce30b3aca3e5f6c51b8fc562ea"); } - @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "ff7bdac468045715d4ac0309d8736c23"); } + @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "0e6379c4a8c6ddc47ddc8059cf72d7bc"); } @Test public void threeWayWithRefs() { WalkerTestSpec spec = new WalkerTestSpec( @@ -87,7 +87,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("13cb75cf37ec370763a34910ba48e42f")); + Arrays.asList("070dd17f49231576e4f8713e384c8026")); executeTest("threeWayWithRefs", spec); }