diff --git a/perl/liftOverVCF.pl b/perl/liftOverVCF.pl index e02ba98a8..4b3627612 100755 --- a/perl/liftOverVCF.pl +++ b/perl/liftOverVCF.pl @@ -11,6 +11,7 @@ my $chain = undef; my $newRef = undef; my $oldRef = undef; my $out = undef; +my $cacheSize = 50000; my $tmp = "/tmp"; GetOptions( "vcf=s" => \$in, "gatk=s" => \$gatk, @@ -18,11 +19,12 @@ GetOptions( "vcf=s" => \$in, "newRef=s" => \$newRef, "oldRef=s" => \$oldRef, "out=s" => \$out, + "cache=s" => \$cacheSize, "tmp=s" => \$tmp); if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { - print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-tmp \t\t\n"; - print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n"; + print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-cache \t\t\n\t-tmp \t\t\n"; + print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n\t-cache 60000\n"; exit(1); } @@ -30,42 +32,19 @@ if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { my $random_number = rand(); my $tmp_prefix = "$tmp/$random_number"; print "Writing temporary files to prefix: $tmp_prefix\n"; -my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; +my $lifted_vcf = "$tmp_prefix.lifted.vcf"; # lift over the file print "Lifting over the vcf..."; -my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $lifted_vcf -chain $chain -dict $newRef.dict -cache $cacheSize"; system($cmd); -# we need to sort the lifted over file now -print "\nRe-sorting the vcf...\n"; -my $sorted_vcf = "$tmp_prefix.sorted.vcf"; -open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!"; - -# write the header -open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!"; -my $inHeader = 1; -while ( $inHeader == 1 ) { - my $line = ; - if ( $line !~ m/^#/ ) { - $inHeader = 0; - } else { - print SORTED "$line"; - } -} -close(UNSORTED); - -$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai"; -print SORTED `$cmd`; -close(SORTED); - # Filter the VCF for bad records print "\nFixing/removing bad records...\n"; -$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $lifted_vcf -o $out"; system($cmd); # clean up -unlink $unsorted_vcf; -unlink $sorted_vcf; +unlink $lifted_vcf; print "\nDone!\n";