From e50627a49e887a52bd85c58656217d5a275cd09f Mon Sep 17 00:00:00 2001 From: ebanks Date: Mon, 12 Jul 2010 17:58:18 +0000 Subject: [PATCH] 1. Updated tests and added integration test for liftover code. 2. Updated liftover code (and scripts) to emit vcf 4.0 and no longer depend on VCFRecord. 3. Beagle walker now also emits vcf 4.0. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3767 348d0f76-0448-11de-a6fe-93d51630548a --- .../variantcontext/VariantContextUtils.java | 3 ++ .../variantutils/FilterLiftedVariants.java | 39 ++++++++-------- .../variantutils/LiftoverVariants.java | 42 +++++++++-------- .../gatk/walkers/BeagleOutputToVCFWalker.java | 2 +- .../sting/utils/genotype/vcf/VCFUtils.java | 4 +- .../variantutils/CombineVariantsUnitTest.java | 7 +-- .../LiftoverVariantsIntegrationTest.java | 46 +++++++++++++++++++ perl/liftOverVCF.pl | 4 +- 8 files changed, 103 insertions(+), 44 deletions(-) create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java index 016fde574..3ade0eebf 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/variantcontext/VariantContextUtils.java @@ -433,4 +433,7 @@ public class VariantContextUtils { return new VariantContext(vc.getName(), vc.getLocation(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.getFilters(), attributes); } + public static VariantContext modifyLocation(VariantContext vc, GenomeLoc loc) { + return new VariantContext(vc.getName(), loc, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.getFilters(), vc.getAttributes()); + } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index b974a4ebd..3fbfbc377 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -25,41 +25,46 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broad.tribble.vcf.VCFRecord; -import org.broad.tribble.vcf.VCFCodec; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broad.tribble.vcf.VCFHeader; -import java.util.List; +import java.util.*; /** * Filters a lifted-over VCF file for ref bases that have been changed. */ -@Requires(value={},referenceMetaData=@RMD(name="vcf",type= VCFRecord.class)) +@Requires(value={},referenceMetaData=@RMD(name="variant",type= ReferenceOrderedDatum.class)) public class FilterLiftedVariants extends RodWalker { private VCFWriter writer; private long failedLocs = 0, totalLocs = 0; - public void initialize() {} + public void initialize() { + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); + Map vcfHeaders = SampleUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); - private void filterAndWrite(char ref, VCFRecord record) { + writer = new VCFWriter(out, true); + final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey("variant") ? vcfHeaders.get("variant").getMetaData() : null, samples); + writer.writeHeader(vcfHeader); + } + + private void filterAndWrite(byte ref, VariantContext vc) { totalLocs++; - char recordRef = record.getReference().charAt(0); + byte recordRef = vc.getReference().getBases()[0]; if ( recordRef != ref ) { failedLocs++; } else { - if ( writer == null ) { - writer = new VCFWriter(out); - writer.writeHeader(record.getHeader()); - } - writer.addRecord(record); + writer.add(vc, new byte[]{ref}); } } @@ -67,10 +72,9 @@ public class FilterLiftedVariants extends RodWalker { if ( tracker == null ) return 0; - List rods = tracker.getReferenceMetaData("vcf"); - - for ( Object rod : rods ) - filterAndWrite((char)ref.getBase(), (VCFRecord)rod); + Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + for ( VariantContext vc : VCs ) + filterAndWrite(ref.getBase(), vc); return 0; } @@ -80,8 +84,7 @@ public class FilterLiftedVariants extends RodWalker { public Integer reduce(Integer value, Integer sum) { return 0; } public void onTraversalDone(Integer result) { - if ( writer != null ) - writer.close(); + writer.close(); System.out.println("Filtered " + failedLocs + " records out of " + totalLocs + " total records."); } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 1308e5283..8ca828daf 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -26,15 +26,20 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broad.tribble.vcf.VCFRecord; -import org.broad.tribble.vcf.VCFCodec; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broad.tribble.vcf.VCFHeader; import java.io.File; -import java.util.List; +import java.util.*; import net.sf.picard.liftover.LiftOver; import net.sf.picard.util.Interval; @@ -44,7 +49,7 @@ import net.sf.samtools.SAMFileReader; /** * Lifts a VCF file over from one build to another. Note that the resulting VCF could be mis-sorted. */ -@Requires(value={},referenceMetaData=@RMD(name="vcf",type= VCFRecord.class)) +@Requires(value={},referenceMetaData=@RMD(name="variant", type=ReferenceOrderedDatum.class)) public class LiftoverVariants extends RodWalker { @Argument(fullName="chain", shortName="chain", doc="Chain file", required=true) @@ -65,20 +70,23 @@ public class LiftoverVariants extends RodWalker { final SAMFileHeader toHeader = new SAMFileReader(NEW_SEQ_DICT).getFileHeader(); liftOver.validateToSequences(toHeader.getSequenceDictionary()); + + Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList("variant")); + Map vcfHeaders = SampleUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); + + writer = new VCFWriter(out, true); + final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey("variant") ? vcfHeaders.get("variant").getMetaData() : null, samples); + writer.writeHeader(vcfHeader); } - private void convertAndWrite(VCFRecord record) { + private void convertAndWrite(VariantContext vc, ReferenceContext ref) { - final Interval fromInterval = new Interval(record.getChr(), record.getStart(), record.getEnd()); + final Interval fromInterval = new Interval(vc.getChr(), vc.getStart(), vc.getEnd()); final Interval toInterval = liftOver.liftOver(fromInterval); if ( toInterval != null ) { - record.setLocation(toInterval.getSequence(), toInterval.getStart()); - if ( writer == null ) { - writer = new VCFWriter(out); - writer.writeHeader(record.getHeader()); - } - writer.addRecord(record); + vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getEnd())); + writer.add(vc, new byte[]{ref.getBase()}); successfulIntervals++; } else { failedIntervals++; @@ -89,10 +97,9 @@ public class LiftoverVariants extends RodWalker { if ( tracker == null ) return 0; - List rods = tracker.getReferenceMetaData("vcf"); - - for ( Object rod : rods ) - convertAndWrite((VCFRecord)rod); + Collection VCs = tracker.getVariantContexts(ref, "variant", null, context.getLocation(), true, false); + for ( VariantContext vc : VCs ) + convertAndWrite(vc, ref); return 0; } @@ -102,8 +109,7 @@ public class LiftoverVariants extends RodWalker { public Integer reduce(Integer value, Integer sum) { return 0; } public void onTraversalDone(Integer result) { - if ( writer != null ) - writer.close(); + writer.close(); System.out.println("Converted " + successfulIntervals + " records; failed to convert " + failedIntervals + " records."); } } diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java index 6d37b48d5..d228de697 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BeagleOutputToVCFWalker.java @@ -82,7 +82,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { hInfo.add(new VCFHeaderLine("source", "BeagleImputation")); // Open output file specified by output VCF ROD - vcfWriter = new VCFWriter(new File(OUTPUT_FILE)); + vcfWriter = new VCFWriter(new File(OUTPUT_FILE), true); Set samples = SampleUtils.getSampleListWithVCFHeader(getToolkit(), Arrays.asList(INPUT_ROD_NAME)); final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java index 470880c27..e74664018 100755 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFUtils.java @@ -168,10 +168,10 @@ public class VCFUtils { if (! (compLine).equalsExcludingDescription(compOther) ) throw new IllegalStateException("Incompatible header types, collision between these two types: " + line + " " + other ); if ( ! compLine.getDescription().equals(compOther) ) - logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine)); + if ( logger != null ) logger.warn(String.format("Allowing unequal description fields through: keeping " + compOther + " excluding " + compLine)); } else { // we are not equal, but we're not anything special either - logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other)); + if ( logger != null ) logger.warn(String.format("Ignoring header line already in map: this header line = " + line + " already present header = " + other)); } } else { line.setVersion(VCFHeaderVersion.VCF4_0); diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java index 63047555e..367d4daf8 100644 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsUnitTest.java @@ -4,6 +4,7 @@ import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.vcf.VCFHeaderLine; import org.broadinstitute.sting.gatk.refdata.features.vcf4.VCF4Codec; import org.broadinstitute.sting.utils.genotype.vcf.VCFHeaderUnitTest; +import org.broadinstitute.sting.utils.genotype.vcf.VCFUtils; import org.junit.Assert; import org.junit.Test; @@ -74,7 +75,7 @@ public class CombineVariantsUnitTest { ArrayList headers = new ArrayList(); headers.add(one); headers.add(two); - Set lines = CombineVariants.smartMergeHeaders(headers); + Set lines = VCFUtils.smartMergeHeaders(headers, null); Assert.assertEquals(VCFHeaderUnitTest.VCF4headerStrings.length,lines.size()); } @@ -85,7 +86,7 @@ public class CombineVariantsUnitTest { ArrayList headers = new ArrayList(); headers.add(one); headers.add(two); - Set lines = CombineVariants.smartMergeHeaders(headers); + Set lines = VCFUtils.smartMergeHeaders(headers, null); Assert.assertEquals(VCFHeaderUnitTest.VCF4headerStrings.length,lines.size()); } @@ -96,7 +97,7 @@ public class CombineVariantsUnitTest { ArrayList headers = new ArrayList(); headers.add(one); headers.add(two); - Set lines = CombineVariants.smartMergeHeaders(headers); + Set lines = VCFUtils.smartMergeHeaders(headers, null); Assert.assertEquals(VCFHeaderUnitTest.VCF4headerStrings.length,lines.size()); } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java new file mode 100755 index 000000000..9e1284332 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.junit.Test; + +import java.util.Arrays; + +/** + * Tests LiftoverVariants + */ +public class LiftoverVariantsIntegrationTest extends WalkerTest { + + @Test + public void testb36Tohg19() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T LiftoverVariants -o %s -R " + oneKGLocation + "reference/human_b36_both.fasta -B variant,vcf," + validationDataLocation + "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.500.vcf -chain " + validationDataLocation + "b36ToHg19.broad.over.chain -dict " + oneKGLocation + "reference/human_g1k_v37.dict", + 1, + Arrays.asList("aa347e1d379ed090cbd54eef829611f9")); + executeTest("test b36 to hg19", spec); + } +} \ No newline at end of file diff --git a/perl/liftOverVCF.pl b/perl/liftOverVCF.pl index 42f56e632..f0934b9ca 100755 --- a/perl/liftOverVCF.pl +++ b/perl/liftOverVCF.pl @@ -34,7 +34,7 @@ my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B vcf,vcf,$in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B variant,vcf,$in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; system($cmd); # we need to sort the lifted over file now @@ -61,7 +61,7 @@ close(SORTED); # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B vcf,vcf,$sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B variant,vcf,$sorted_vcf -o $out"; system($cmd); # clean up