From 444eae316c8d33e4b3ff765aa5b7211f5bd746a9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 1 Jul 2011 17:26:25 -0400 Subject: [PATCH] Moving these supported perl scripts to public --- public/perl/liftOverVCF.pl | 83 ++++++++++++++++++++++++ public/perl/sortByRef.pl | 127 +++++++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100755 public/perl/liftOverVCF.pl create mode 100755 public/perl/sortByRef.pl diff --git a/public/perl/liftOverVCF.pl b/public/perl/liftOverVCF.pl new file mode 100755 index 000000000..21cb8bb6b --- /dev/null +++ b/public/perl/liftOverVCF.pl @@ -0,0 +1,83 @@ +#!/usr/bin/perl -w + +# Runs the liftover tool on a VCF and properly handles the output + +use strict; +use Getopt::Long; + +my $in = undef; +my $gatk = undef; +my $chain = undef; +my $newRef = undef; +my $oldRef = undef; +my $out = undef; +my $tmp = "/tmp"; +my $recordOriginalLocation = 0; +GetOptions( "vcf=s" => \$in, + "gatk=s" => \$gatk, + "chain=s" => \$chain, + "newRef=s" => \$newRef, + "oldRef=s" => \$oldRef, + "out=s" => \$out, + "tmp=s" => \$tmp, + "recordOriginalLocation" => \$recordOriginalLocation); + +if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) { + print "Usage: liftOverVCF.pl\n\t-vcf \t\t\n\t-gatk \t\t\n\t-chain \t\t\n\t-newRef \t\n\t-oldRef \t\n\t-out \t\t\n\t-tmp \t\t\n\t-recordOriginalLocation \t\t\n"; + print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n"; + exit(1); +} + +# generate a random number +my $random_number = rand(); +my $tmp_prefix = "$tmp/$random_number"; +print "Writing temporary files to prefix: $tmp_prefix\n"; +my $unsorted_vcf = "$tmp_prefix.unsorted.vcf"; + +# lift over the file +print "Lifting over the vcf..."; +my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict"; +if ($recordOriginalLocation) { + $cmd .= " -recordOriginalLocation"; +} +system($cmd) == 0 or quit("The liftover step failed. Please correct the necessary errors before retrying."); + +# we need to sort the lifted over file now +print "\nRe-sorting the vcf...\n"; +my $sorted_vcf = "$tmp_prefix.sorted.vcf"; +open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!"; + +# write the header +open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!"; +my $inHeader = 1; +while ( $inHeader == 1 ) { + my $line = ; + if ( $line !~ m/^#/ ) { + $inHeader = 0; + } else { + print SORTED "$line"; + } +} +close(UNSORTED); +close(SORTED); + +$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/public/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai >> $sorted_vcf"; +system($cmd) == 0 or quit("The sorting step failed. Please correct the necessary errors before retrying."); + +# Filter the VCF for bad records +print "\nFixing/removing bad records...\n"; +$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out"; +system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying."); + +# clean up +unlink $unsorted_vcf; +unlink $sorted_vcf; +my $sorted_index = "$sorted_vcf.idx"; +unlink $sorted_index; + +print "\nDone!\n"; + +sub quit { + print "\n$_[0]\n"; + exit(1); +} diff --git a/public/perl/sortByRef.pl b/public/perl/sortByRef.pl new file mode 100755 index 000000000..71d3f4477 --- /dev/null +++ b/public/perl/sortByRef.pl @@ -0,0 +1,127 @@ +#!/usr/bin/perl -w + +use strict; +use Getopt::Long; + +sub usage { + + print "\nUsage:\n"; + print "sortByRef.pl [--k POS] [--tmp dir] INPUT REF_DICT\n\n"; + + print " Sorts lines of the input file INFILE according\n"; + print " to the reference contig order specified by the\n"; + print " reference dictionary REF_DICT (.fai file).\n"; + print " The sort is stable. If -k option is not specified,\n"; + print " it is assumed that the contig name is the first\n"; + print " field in each line.\n\n"; + print " INPUT input file to sort. If '-' is specified, \n"; + print " then reads from STDIN.\n"; + print " REF_DICT .fai file, or ANY file that has contigs, in the\n"; + print " desired soting order, as its first column.\n"; + print " --k POS : contig name is in the field POS (1-based)\n"; + print " of input lines.\n\n"; + print " --tmp DIR : temp directory [default=/tmp]\n\n"; + + exit(1); +} + +my $pos = 1; +my $tmp = "/tmp"; +GetOptions( "k:i" => \$pos, + "tmp=s" => \$tmp); + +$pos--; + +usage() if ( scalar(@ARGV) == 0 ); + +if ( scalar(@ARGV) != 2 ) { + print "Wrong number of arguments\n"; + usage(); +} + +my $input_file = $ARGV[0]; +my $dict_file = $ARGV[1]; + + +open(DICT, "< $dict_file") or die("Can not open $dict_file: $!"); + +my %ref_order; + +my $n = 0; +while ( ) { + chomp; + my ($contig, $rest) = split "\t"; + die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} ); + + $ref_order{$contig} = $n; + $n++; +} + +close DICT; +#we have loaded contig ordering now + +my $INPUT; +if ($input_file eq "-" ) { + $INPUT = "STDIN"; +} else { + open($INPUT, "< $input_file") or die("Can not open $input_file: $!"); +} + +my %temp_outputs; + +while ( <$INPUT> ) { + + my @fields = split '\s'; + die("Specified field position exceeds the number of fields:\n$_") + if ( $pos >= scalar(@fields) ); + + my $contig = $fields[$pos]; + if ( $contig =~ m/:/ ) { + my @loc = split(/:/, $contig); + # print $contig . " " . $loc[0] . "\n"; + $contig = $loc[0] + } + chomp $contig if ( $pos == scalar(@fields) - 1 ); # if last field in line + + my $order; + if ( defined $ref_order{$contig} ) { $order = $ref_order{$contig}; } + else { + $ref_order{$contig} = $n; + $order = $n; # input line has contig that was not in the dict; + $n++; # this contig will go at the end of the output, + # after all known contigs + } + + my $fhandle; + if ( defined $temp_outputs{$order} ) { $fhandle = $temp_outputs{$order} } + else { + #print "opening $order $$ $_\n"; + open( $fhandle, " > $tmp/sortByRef.$$.$order.tmp" ) or + die ( "Can not open temporary file $order: $!"); + $temp_outputs{$order} = $fhandle; + } + + # we got the handle to the temp file that keeps all + # lines with contig $contig + + print $fhandle $_; # send current line to its corresponding temp file +} + +close $INPUT; + +foreach my $f ( values %temp_outputs ) { close $f; } + +# now collect back into single output stream: + +for ( my $i = 0 ; $i < $n ; $i++ ) { + # if we did not have any lines on contig $i, then there's + # no temp file and nothing to do + next if ( ! defined $temp_outputs{$i} ) ; + + my $f; + open ( $f, "< $tmp/sortByRef.$$.$i.tmp" ); + while ( <$f> ) { print ; } + close $f; + + unlink "$tmp/sortByRef.$$.$i.tmp"; +}