Moving these supported perl scripts to public
This commit is contained in:
parent
cfdece6dd8
commit
444eae316c
|
|
@ -0,0 +1,83 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
# Runs the liftover tool on a VCF and properly handles the output
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
my $in = undef;
|
||||
my $gatk = undef;
|
||||
my $chain = undef;
|
||||
my $newRef = undef;
|
||||
my $oldRef = undef;
|
||||
my $out = undef;
|
||||
my $tmp = "/tmp";
|
||||
my $recordOriginalLocation = 0;
|
||||
GetOptions( "vcf=s" => \$in,
|
||||
"gatk=s" => \$gatk,
|
||||
"chain=s" => \$chain,
|
||||
"newRef=s" => \$newRef,
|
||||
"oldRef=s" => \$oldRef,
|
||||
"out=s" => \$out,
|
||||
"tmp=s" => \$tmp,
|
||||
"recordOriginalLocation" => \$recordOriginalLocation);
|
||||
|
||||
if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
|
||||
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n\t-recordOriginalLocation \t\t<Should we record what the original location was in the INFO field?; defaults to false>\n";
|
||||
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
# generate a random number
|
||||
my $random_number = rand();
|
||||
my $tmp_prefix = "$tmp/$random_number";
|
||||
print "Writing temporary files to prefix: $tmp_prefix\n";
|
||||
my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
|
||||
|
||||
# lift over the file
|
||||
print "Lifting over the vcf...";
|
||||
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
|
||||
if ($recordOriginalLocation) {
|
||||
$cmd .= " -recordOriginalLocation";
|
||||
}
|
||||
system($cmd) == 0 or quit("The liftover step failed. Please correct the necessary errors before retrying.");
|
||||
|
||||
# we need to sort the lifted over file now
|
||||
print "\nRe-sorting the vcf...\n";
|
||||
my $sorted_vcf = "$tmp_prefix.sorted.vcf";
|
||||
open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!";
|
||||
|
||||
# write the header
|
||||
open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!";
|
||||
my $inHeader = 1;
|
||||
while ( $inHeader == 1 ) {
|
||||
my $line = <UNSORTED>;
|
||||
if ( $line !~ m/^#/ ) {
|
||||
$inHeader = 0;
|
||||
} else {
|
||||
print SORTED "$line";
|
||||
}
|
||||
}
|
||||
close(UNSORTED);
|
||||
close(SORTED);
|
||||
|
||||
$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/public/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai >> $sorted_vcf";
|
||||
system($cmd) == 0 or quit("The sorting step failed. Please correct the necessary errors before retrying.");
|
||||
|
||||
# Filter the VCF for bad records
|
||||
print "\nFixing/removing bad records...\n";
|
||||
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out";
|
||||
system($cmd) == 0 or quit("The filtering step failed. Please correct the necessary errors before retrying.");
|
||||
|
||||
# clean up
|
||||
unlink $unsorted_vcf;
|
||||
unlink $sorted_vcf;
|
||||
my $sorted_index = "$sorted_vcf.idx";
|
||||
unlink $sorted_index;
|
||||
|
||||
print "\nDone!\n";
|
||||
|
||||
sub quit {
|
||||
print "\n$_[0]\n";
|
||||
exit(1);
|
||||
}
|
||||
|
|
@ -0,0 +1,127 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
sub usage {
|
||||
|
||||
print "\nUsage:\n";
|
||||
print "sortByRef.pl [--k POS] [--tmp dir] INPUT REF_DICT\n\n";
|
||||
|
||||
print " Sorts lines of the input file INFILE according\n";
|
||||
print " to the reference contig order specified by the\n";
|
||||
print " reference dictionary REF_DICT (.fai file).\n";
|
||||
print " The sort is stable. If -k option is not specified,\n";
|
||||
print " it is assumed that the contig name is the first\n";
|
||||
print " field in each line.\n\n";
|
||||
print " INPUT input file to sort. If '-' is specified, \n";
|
||||
print " then reads from STDIN.\n";
|
||||
print " REF_DICT .fai file, or ANY file that has contigs, in the\n";
|
||||
print " desired soting order, as its first column.\n";
|
||||
print " --k POS : contig name is in the field POS (1-based)\n";
|
||||
print " of input lines.\n\n";
|
||||
print " --tmp DIR : temp directory [default=/tmp]\n\n";
|
||||
|
||||
exit(1);
|
||||
}
|
||||
|
||||
my $pos = 1;
|
||||
my $tmp = "/tmp";
|
||||
GetOptions( "k:i" => \$pos,
|
||||
"tmp=s" => \$tmp);
|
||||
|
||||
$pos--;
|
||||
|
||||
usage() if ( scalar(@ARGV) == 0 );
|
||||
|
||||
if ( scalar(@ARGV) != 2 ) {
|
||||
print "Wrong number of arguments\n";
|
||||
usage();
|
||||
}
|
||||
|
||||
my $input_file = $ARGV[0];
|
||||
my $dict_file = $ARGV[1];
|
||||
|
||||
|
||||
open(DICT, "< $dict_file") or die("Can not open $dict_file: $!");
|
||||
|
||||
my %ref_order;
|
||||
|
||||
my $n = 0;
|
||||
while ( <DICT> ) {
|
||||
chomp;
|
||||
my ($contig, $rest) = split "\t";
|
||||
die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} );
|
||||
|
||||
$ref_order{$contig} = $n;
|
||||
$n++;
|
||||
}
|
||||
|
||||
close DICT;
|
||||
#we have loaded contig ordering now
|
||||
|
||||
my $INPUT;
|
||||
if ($input_file eq "-" ) {
|
||||
$INPUT = "STDIN";
|
||||
} else {
|
||||
open($INPUT, "< $input_file") or die("Can not open $input_file: $!");
|
||||
}
|
||||
|
||||
my %temp_outputs;
|
||||
|
||||
while ( <$INPUT> ) {
|
||||
|
||||
my @fields = split '\s';
|
||||
die("Specified field position exceeds the number of fields:\n$_")
|
||||
if ( $pos >= scalar(@fields) );
|
||||
|
||||
my $contig = $fields[$pos];
|
||||
if ( $contig =~ m/:/ ) {
|
||||
my @loc = split(/:/, $contig);
|
||||
# print $contig . " " . $loc[0] . "\n";
|
||||
$contig = $loc[0]
|
||||
}
|
||||
chomp $contig if ( $pos == scalar(@fields) - 1 ); # if last field in line
|
||||
|
||||
my $order;
|
||||
if ( defined $ref_order{$contig} ) { $order = $ref_order{$contig}; }
|
||||
else {
|
||||
$ref_order{$contig} = $n;
|
||||
$order = $n; # input line has contig that was not in the dict;
|
||||
$n++; # this contig will go at the end of the output,
|
||||
# after all known contigs
|
||||
}
|
||||
|
||||
my $fhandle;
|
||||
if ( defined $temp_outputs{$order} ) { $fhandle = $temp_outputs{$order} }
|
||||
else {
|
||||
#print "opening $order $$ $_\n";
|
||||
open( $fhandle, " > $tmp/sortByRef.$$.$order.tmp" ) or
|
||||
die ( "Can not open temporary file $order: $!");
|
||||
$temp_outputs{$order} = $fhandle;
|
||||
}
|
||||
|
||||
# we got the handle to the temp file that keeps all
|
||||
# lines with contig $contig
|
||||
|
||||
print $fhandle $_; # send current line to its corresponding temp file
|
||||
}
|
||||
|
||||
close $INPUT;
|
||||
|
||||
foreach my $f ( values %temp_outputs ) { close $f; }
|
||||
|
||||
# now collect back into single output stream:
|
||||
|
||||
for ( my $i = 0 ; $i < $n ; $i++ ) {
|
||||
# if we did not have any lines on contig $i, then there's
|
||||
# no temp file and nothing to do
|
||||
next if ( ! defined $temp_outputs{$i} ) ;
|
||||
|
||||
my $f;
|
||||
open ( $f, "< $tmp/sortByRef.$$.$i.tmp" );
|
||||
while ( <$f> ) { print ; }
|
||||
close $f;
|
||||
|
||||
unlink "$tmp/sortByRef.$$.$i.tmp";
|
||||
}
|
||||
Loading…
Reference in New Issue