#!/usr/bin/perl -w

# Runs the liftover tool on a VCF and properly handles the output.
#
# Steps performed:
#   1. GATK LiftoverVariants is run on the input VCF (against the old
#      reference) and writes an unsorted VCF into a private temp directory.
#   2. The header lines (those starting with '#') are copied verbatim into a
#      second temp file; the remaining records are piped through sort(1) and
#      $gatk/perl/sortByRef.pl (which is handed the new reference's
#      .fasta.fai) and appended below the header.
#   3. GATK FilterLiftedVariants is run on the sorted file (against the new
#      reference) and writes the final VCF to -out.
#
# Options (all required except -tmp):
#   -vcf     input VCF
#   -gatk    path to the GATK trunk (dist/GenomeAnalysisTK.jar and
#            perl/sortByRef.pl are looked up underneath it)
#   -chain   chain file
#   -newRef  new reference prefix (newRef.dict, .fasta and .fasta.fai are used)
#   -oldRef  old reference prefix (oldRef.fasta is used)
#   -out     output VCF
#   -tmp     temp file location; defaults to /tmp
#
# Exit status: 1 when a required option is missing; non-zero (via die) as
# soon as any external step fails; 0 only when every step succeeded.

use strict;
use warnings;

use File::Temp qw(tempdir);
use Getopt::Long;

# Quotes a string so that /bin/sh treats it as one literal word, which keeps
# paths containing spaces or shell metacharacters intact.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Runs an external command and dies with a descriptive message unless it
# exits with status 0.
#   $step    - short human-readable name of the step, used in error messages
#   @command - either one shell command string, or a program followed by its
#              arguments (the list form bypasses the shell entirely)
sub run_or_die {
    my ($step, @command) = @_;

    my $status   = system(@command);
    my $os_error = $!;    # capture before anything else can overwrite it
    return if $status == 0;

    my $shown = join(' ', @command);
    die "$step: could not execute '$shown': $os_error\n" if $status == -1;
    die sprintf("%s: '%s' was killed by signal %d\n", $step, $shown, $status & 127)
        if $status & 127;
    die sprintf("%s: '%s' exited with status %d\n", $step, $shown, $status >> 8);
}

my $in     = undef;
my $gatk   = undef;
my $chain  = undef;
my $newRef = undef;
my $oldRef = undef;
my $out    = undef;
my $tmp    = "/tmp";

GetOptions(
    "vcf=s"    => \$in,
    "gatk=s"   => \$gatk,
    "chain=s"  => \$chain,
    "newRef=s" => \$newRef,
    "oldRef=s" => \$oldRef,
    "out=s"    => \$out,
    "tmp=s"    => \$tmp,
);

if (!$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out) {
    print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n";
    print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n";
    exit(1);
}

# Intermediate files live in a freshly created, uniquely named directory
# under $tmp instead of a rand()-derived prefix, so concurrent runs cannot
# collide. CLEANUP => 1 removes the directory and everything in it (both
# temp VCFs and the "$sorted_vcf.idx" index) at exit, including after a die.
my $tmp_dir    = tempdir("liftOverVCF.XXXXXX", DIR => $tmp, CLEANUP => 1);
my $tmp_prefix = "$tmp_dir/liftover";
print "Writing temporary files to prefix: $tmp_prefix\n";

my $gatk_jar     = "$gatk/dist/GenomeAnalysisTK.jar";
my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
my $sorted_vcf   = "$tmp_prefix.sorted.vcf";

# lift over the file
print "Lifting over the vcf...";
run_or_die(
    "liftover",
    "java", "-jar", $gatk_jar,
    "-T", "LiftoverVariants",
    "-R", "$oldRef.fasta",
    "-B:variant,vcf", $in,
    "-o", $unsorted_vcf,
    "-chain", $chain,
    "-dict", "$newRef.dict",
);

# we need to sort the lifted over file now
print "\nRe-sorting the vcf...\n";

# write the header: copy the leading '#' lines and stop at the first record
# (or at end of file when the lifted file contains no records at all)
open(my $sorted_fh, '>', $sorted_vcf) or die "can't open $sorted_vcf: $!";
open(my $unsorted_fh, '<', $unsorted_vcf) or die "can't open $unsorted_vcf: $!";
while (defined(my $line = <$unsorted_fh>)) {
    last if $line !~ m/^#/;
    print {$sorted_fh} $line;
}
close($unsorted_fh);
# the header must be flushed to disk before the pipeline below appends to it
close($sorted_fh) or die "can't close $sorted_vcf: $!";

# This step needs the shell for the pipes and the append redirection, so every
# path is quoted individually. Note that /bin/sh reports only the exit status
# of the last command in the pipeline (sortByRef.pl).
my $sort_cmd = join(
    ' ',
    "grep", "-v", shell_quote("^#"), shell_quote($unsorted_vcf),
    "|", "sort", "-n", "-k2", "-T", shell_quote($tmp),
    "|", shell_quote("$gatk/perl/sortByRef.pl"), "--tmp", shell_quote($tmp),
    "-", shell_quote("$newRef.fasta.fai"),
    ">>", shell_quote($sorted_vcf),
);
run_or_die("re-sort", $sort_cmd);

# Filter the VCF for bad records
print "\nFixing/removing bad records...\n";
run_or_die(
    "filter",
    "java", "-jar", $gatk_jar,
    "-T", "FilterLiftedVariants",
    "-R", "$newRef.fasta",
    "-B:variant,vcf", $sorted_vcf,
    "-o", $out,
);

# clean up: nothing to do explicitly, File::Temp removes $tmp_dir at exit
print "\nDone!\n";