Checking in the liftover script. I am including a post-processing walker to filter out bad records written in under 10 minutes as per my agreement with Mark.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3321 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
18f1d31a22
commit
44de92e09d
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.vcf;
|
||||
|
||||
import org.broadinstitute.sting.utils.genotype.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broad.tribble.vcf.VCFRecord;
|
||||
import org.broad.tribble.vcf.VCFCodec;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Filters a lifted-over VCF file for ref bases that have been changed.
|
||||
*/
|
||||
@Requires(value={},referenceMetaData=@RMD(name="vcf",type= VCFCodec.class))
|
||||
public class FilterLiftedVCF extends RodWalker<Integer, Integer> {
|
||||
|
||||
private VCFWriter writer;
|
||||
|
||||
private long failedLocs = 0, totalLocs = 0;
|
||||
|
||||
public void initialize() {}
|
||||
|
||||
private void filterAndWrite(char ref, VCFRecord record) {
|
||||
|
||||
totalLocs++;
|
||||
|
||||
char recordRef = record.getReference().charAt(0);
|
||||
|
||||
if ( recordRef != ref ) {
|
||||
|
||||
// is it reverse complemented?
|
||||
if ( BaseUtils.simpleComplement(recordRef) == ref ) {
|
||||
record.setFilterString(record.isFiltered() ? String.format("%s;LiftoverToReverseComplementRefBase", record.getFilterString()) : "LiftoverToNewBase");
|
||||
failedLocs++;
|
||||
} else {
|
||||
record.setFilterString(record.isFiltered() ? String.format("%s;LiftoverToDifferentRefBase", record.getFilterString()) : "LiftoverToNewBase");
|
||||
failedLocs++;
|
||||
}
|
||||
}
|
||||
|
||||
if ( writer == null ) {
|
||||
writer = new VCFWriter(out);
|
||||
writer.writeHeader(record.getHeader());
|
||||
}
|
||||
writer.addRecord(record);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
List<Object> rods = tracker.getReferenceMetaData("vcf");
|
||||
|
||||
for ( Object rod : rods )
|
||||
filterAndWrite(ref.getBase(), (VCFRecord)rod);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) { return 0; }
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
if ( writer != null )
|
||||
writer.close();
|
||||
System.out.println("Filtered " + failedLocs + " records out of " + totalLocs + " total records.");
|
||||
}
|
||||
}
|
||||
|
|
@ -103,7 +103,8 @@ public class LiftoverVCF extends RodWalker<Integer, Integer> {
|
|||
public Integer reduce(Integer value, Integer sum) { return 0; }
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
writer.close();
|
||||
System.out.println("Converted " + successfulIntervals + " intervals; failed to convert " + failedIntervals + " intervals.");
|
||||
if ( writer != null )
|
||||
writer.close();
|
||||
System.out.println("Converted " + successfulIntervals + " records; failed to convert " + failedIntervals + " records.");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,71 @@
|
|||
#!/usr/bin/perl -w
|
||||
|
||||
# Runs the liftover tool on a VCF and properly handles the output
|
||||
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
|
||||
my $in = undef;
|
||||
my $gatk = undef;
|
||||
my $chain = undef;
|
||||
my $newRef = undef;
|
||||
my $oldRef = undef;
|
||||
my $out = undef;
|
||||
my $tmp = "/tmp";
|
||||
GetOptions( "vcf=s" => \$in,
|
||||
"gatk=s" => \$gatk,
|
||||
"chain=s" => \$chain,
|
||||
"newRef=s" => \$newRef,
|
||||
"oldRef=s" => \$oldRef,
|
||||
"out=s" => \$out,
|
||||
"tmp=s" => \$tmp);
|
||||
|
||||
if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
|
||||
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-tmp <temp file location; defaults to /tmp>";
|
||||
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /broad/1KG/reference/human_b36_both\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
# generate a random number
|
||||
my $random_number = rand();
|
||||
my $tmp_prefix = "$tmp/$random_number";
|
||||
print "Writing temporary files to prefix: $tmp_prefix\n";
|
||||
my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
|
||||
|
||||
# lift over the file
|
||||
print "Lifting over the vcf...";
|
||||
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVCF -R $oldRef.fasta -B vcf,vcf,$in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
|
||||
system($cmd);
|
||||
|
||||
# we need to sort the lifted over file now
|
||||
print "\nRe-sorting the vcf...\n";
|
||||
my $sorted_vcf = "$tmp_prefix.sorted.vcf";
|
||||
open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!";
|
||||
|
||||
# write the header
|
||||
open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!";
|
||||
my $inHeader = 1;
|
||||
while ( $inHeader == 1 ) {
|
||||
my $line = <UNSORTED>;
|
||||
if ( $line !~ m/^#/ ) {
|
||||
$inHeader = 0;
|
||||
} else {
|
||||
print SORTED "$line";
|
||||
}
|
||||
}
|
||||
close(UNSORTED);
|
||||
|
||||
$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n +1 | $gatk/perl/sortByRef.pl - $newRef.fasta.fai";
|
||||
print SORTED `$cmd`;
|
||||
close(SORTED);
|
||||
|
||||
# Filter the VCF for bad records
|
||||
print "\nFixing/removing bad records...\n";
|
||||
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVCF -R $newRef.fasta -B vcf,vcf,$sorted_vcf -o $out";
|
||||
system($cmd);
|
||||
|
||||
# clean up
|
||||
unlink $unsorted_vcf;
|
||||
unlink $sorted_vcf;
|
||||
|
||||
print "\nDone!\n";
|
||||
Loading…
Reference in New Issue