From 44de92e09d87cc1bbc60ab0cc0d3031ca849c695 Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 7 May 2010 12:31:56 +0000 Subject: [PATCH] Checking in the liftover script. I am including a post-processing walker to filter out bad records written in under 10 minutes as per my agreement with Mark. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3321 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/walkers/vcf/FilterLiftedVCF.java | 97 +++++++++++++++++++ .../sting/gatk/walkers/vcf/LiftoverVCF.java | 5 +- perl/liftOverVCF.pl | 71 ++++++++++++++ 3 files changed, 171 insertions(+), 2 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/vcf/FilterLiftedVCF.java create mode 100755 perl/liftOverVCF.pl diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/vcf/FilterLiftedVCF.java b/java/src/org/broadinstitute/sting/gatk/walkers/vcf/FilterLiftedVCF.java new file mode 100755 index 000000000..3defaa748 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/vcf/FilterLiftedVCF.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.vcf; + +import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broad.tribble.vcf.VCFRecord; +import org.broad.tribble.vcf.VCFCodec; + +import java.util.List; + +/** + * Filters a lifted-over VCF file for ref bases that have been changed. + */ +@Requires(value={},referenceMetaData=@RMD(name="vcf",type= VCFCodec.class)) +public class FilterLiftedVCF extends RodWalker { + + private VCFWriter writer; + + private long failedLocs = 0, totalLocs = 0; + + public void initialize() {} + + private void filterAndWrite(char ref, VCFRecord record) { + + totalLocs++; + + char recordRef = record.getReference().charAt(0); + + if ( recordRef != ref ) { + + // is it reverse complemented? + if ( BaseUtils.simpleComplement(recordRef) == ref ) { + record.setFilterString(record.isFiltered() ? String.format("%s;LiftoverToReverseComplementRefBase", record.getFilterString()) : "LiftoverToNewBase"); + failedLocs++; + } else { + record.setFilterString(record.isFiltered() ? String.format("%s;LiftoverToDifferentRefBase", record.getFilterString()) : "LiftoverToNewBase"); + failedLocs++; + } + } + + if ( writer == null ) { + writer = new VCFWriter(out); + writer.writeHeader(record.getHeader()); + } + writer.addRecord(record); + } + + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return 0; + + List rods = tracker.getReferenceMetaData("vcf"); + + for ( Object rod : rods ) + filterAndWrite(ref.getBase(), (VCFRecord)rod); + + return 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { return 0; } + + public void onTraversalDone(Integer result) { + if ( writer != null ) + writer.close(); + System.out.println("Filtered " + failedLocs + " records out of " + totalLocs + " total records."); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/vcf/LiftoverVCF.java b/java/src/org/broadinstitute/sting/gatk/walkers/vcf/LiftoverVCF.java index c3680dae7..951a0c090 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/vcf/LiftoverVCF.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/vcf/LiftoverVCF.java @@ -103,7 +103,8 @@ public class LiftoverVCF extends RodWalker { public Integer reduce(Integer value, Integer sum) { return 0; } public void onTraversalDone(Integer result) { - writer.close(); - System.out.println("Converted " + successfulIntervals + " intervals; failed to convert " + failedIntervals + " intervals."); + if ( writer != null ) + writer.close(); + System.out.println("Converted " + successfulIntervals + " records; failed to convert " + failedIntervals + " records."); } } diff --git a/perl/liftOverVCF.pl b/perl/liftOverVCF.pl new file mode 100755 index 000000000..b38a379ff --- /dev/null +++ b/perl/liftOverVCF.pl @@ -0,0 +1,71 @@ +#!/usr/bin/perl -w + +# Runs the liftover tool on a VCF and properly handles the output + +use strict; +use Getopt::Long; + +my $in = undef; +my $gatk = undef; +my $chain = undef; +my $newRef = undef; +my $oldRef = undef; +my $out = undef; +my $tmp = "/tmp"; +GetOptions( "vcf=s" => \$in, + "gatk=s" => \$gatk, + "chain=s" => \$chain, + "newRef=s" => \$newRef, + "oldRef=s" => \$oldRef, + "out=s" => \$out, + "tmp=s" => \$tmp); + +if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { + print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-tmp "; + print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /broad/1KG/reference/human_b36_both\n"; + exit(1); +} + +# generate a random number +my $random_number = rand(); +my $tmp_prefix = "$tmp/$random_number"; +print "Writing temporary files to prefix: $tmp_prefix\n"; +my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; + +# lift over the file +print "Lifting over the vcf..."; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVCF -R $oldRef.fasta -B vcf,vcf,$in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +system($cmd); + +# we need to sort the lifted over file now +print "\nRe-sorting the vcf...\n"; +my $sorted_vcf = "$tmp_prefix.sorted.vcf"; +open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!"; + +# write the header +open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!"; +my $inHeader = 1; +while ( $inHeader == 1 ) { + my $line = ; + if ( $line !~ m/^#/ ) { + $inHeader = 0; + } else { + print SORTED "$line"; + } +} +close(UNSORTED); + +$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n +1 | $gatk/perl/sortByRef.pl - $newRef.fasta.fai"; +print SORTED `$cmd`; +close(SORTED); + +# Filter the VCF for bad records +print "\nFixing/removing bad records...\n"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVCF -R $newRef.fasta -B vcf,vcf,$sorted_vcf -o $out"; +system($cmd); + +# clean up +unlink $unsorted_vcf; +unlink $sorted_vcf; + +print "\nDone!\n";