diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 899194cef..bf0084fd0 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -30,15 +30,13 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.utils.vcf.VCFUtils; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.SortingVCFWriter; -import org.broad.tribble.TribbleException; +import org.broad.tribble.vcf.StandardVCFWriter; import java.io.File; import java.util.*; @@ -56,7 +54,7 @@ public class LiftoverVariants extends RodWalker { @Output(doc="File to which variants should be written",required=true) protected File file = null; - protected SortingVCFWriter writer = null; + protected StandardVCFWriter writer = null; @Argument(fullName="chain", shortName="chain", doc="Chain file", required=true) protected File CHAIN = null; @@ -64,9 +62,6 @@ public class LiftoverVariants extends RodWalker { @Argument(fullName="newSequenceDictionary", shortName="dict", doc="Sequence .dict file for the new build", required=true) protected File NEW_SEQ_DICT = null; - @Argument(fullName="maxCachingDistance", shortName="cache", doc="Maximum genomic distance for which we will cache variants", required=false) - protected Integer maxCachingDistance = 50000; - private LiftOver liftOver; private long successfulIntervals = 0, failedIntervals = 0; @@ -82,8 +77,7 @@ public class LiftoverVariants extends RodWalker { Map vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant")); final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey("variant") ? vcfHeaders.get("variant").getMetaData() : null, samples); - - writer = new SortingVCFWriter(file, maxCachingDistance); + writer = new StandardVCFWriter(file, false); writer.writeHeader(vcfHeader); } @@ -95,11 +89,7 @@ public class LiftoverVariants extends RodWalker { if ( toInterval != null ) { vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createPotentiallyInvalidGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length)); - try { - writer.add(vc, ref.getBase()); - } catch (TribbleException e) { - throw new UserException(e.getMessage()); - } + writer.add(vc, ref.getBase()); successfulIntervals++; } else { failedIntervals++; @@ -122,8 +112,7 @@ public class LiftoverVariants extends RodWalker { public Integer reduce(Integer value, Integer sum) { return 0; } public void onTraversalDone(Integer result) { - writer.flush(); - writer.close(); System.out.println("Converted " + successfulIntervals + " records; failed to convert " + failedIntervals + " records."); + writer.close(); } } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index 785763dbd..2cd588003 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -49,7 +49,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T LiftoverVariants -o %s -R " + hg18Reference + " -B:variant,vcf " + validationDataLocation + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict", 1, - Arrays.asList("ce71a6d40f7375826e31adb3832d2fe3")); + Arrays.asList("22d4cd78d72da55f2423b9ab06f467b2")); executeTest("test hg18 to hg19, unsorted", spec); } } \ No newline at end of file diff --git a/perl/liftOverVCF.pl b/perl/liftOverVCF.pl index 4b3627612..e02ba98a8 100755 --- a/perl/liftOverVCF.pl +++ b/perl/liftOverVCF.pl @@ -11,7 +11,6 @@ my $chain = undef; my $newRef = undef; my $oldRef = undef; my $out = undef; -my $cacheSize = 50000; my $tmp = "/tmp"; GetOptions( "vcf=s" => \$in, "gatk=s" => \$gatk, @@ -19,12 +18,11 @@ GetOptions( "vcf=s" => \$in, "newRef=s" => \$newRef, "oldRef=s" => \$oldRef, "out=s" => \$out, - "cache=s" => \$cacheSize, "tmp=s" => \$tmp); if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { - print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-cache \t\t\n\t-tmp \t\t\n"; - print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n\t-cache 60000\n"; + print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-tmp \t\t\n"; + print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n"; exit(1); } @@ -32,19 +30,42 @@ if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { my $random_number = rand(); my $tmp_prefix = "$tmp/$random_number"; print "Writing temporary files to prefix: $tmp_prefix\n"; -my $lifted_vcf = "$tmp_prefix.lifted.vcf"; +my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $lifted_vcf -chain $chain -dict $newRef.dict -cache $cacheSize"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; system($cmd); +# we need to sort the lifted over file now +print "\nRe-sorting the vcf...\n"; +my $sorted_vcf = "$tmp_prefix.sorted.vcf"; +open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!"; + +# write the header +open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!"; +my $inHeader = 1; +while ( $inHeader == 1 ) { + my $line = ; + if ( $line !~ m/^#/ ) { + $inHeader = 0; + } else { + print SORTED "$line"; + } +} +close(UNSORTED); + +$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai"; +print SORTED `$cmd`; +close(SORTED); + # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $lifted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; system($cmd); # clean up -unlink $lifted_vcf; +unlink $unsorted_vcf; +unlink $sorted_vcf; print "\nDone!\n";