Hmm. Apparently variants can get lifted over to different chromosomes. Who knew? Reverting changes from a couple of days ago. The only way to do this correctly (without requiring lots of memory) is to turn off on-the-fly indexing for this walker. Integration tests cover this now.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4510 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2010-10-17 02:54:12 +00:00
parent 196029c0b4
commit 7aa030a9a4
3 changed files with 35 additions and 25 deletions

View File

@ -30,15 +30,13 @@ import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.SortingVCFWriter;
import org.broad.tribble.TribbleException;
import org.broad.tribble.vcf.StandardVCFWriter;
import java.io.File;
import java.util.*;
@ -56,7 +54,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
@Output(doc="File to which variants should be written",required=true)
protected File file = null;
protected SortingVCFWriter writer = null;
protected StandardVCFWriter writer = null;
@Argument(fullName="chain", shortName="chain", doc="Chain file", required=true)
protected File CHAIN = null;
@ -64,9 +62,6 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="newSequenceDictionary", shortName="dict", doc="Sequence .dict file for the new build", required=true)
protected File NEW_SEQ_DICT = null;
@Argument(fullName="maxCachingDistance", shortName="cache", doc="Maximum genomic distance for which we will cache variants", required=false)
protected Integer maxCachingDistance = 50000;
private LiftOver liftOver;
private long successfulIntervals = 0, failedIntervals = 0;
@ -82,8 +77,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant"));
final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey("variant") ? vcfHeaders.get("variant").getMetaData() : null, samples);
writer = new SortingVCFWriter(file, maxCachingDistance);
writer = new StandardVCFWriter(file, false);
writer.writeHeader(vcfHeader);
}
@ -95,11 +89,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
if ( toInterval != null ) {
vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createPotentiallyInvalidGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length));
try {
writer.add(vc, ref.getBase());
} catch (TribbleException e) {
throw new UserException(e.getMessage());
}
writer.add(vc, ref.getBase());
successfulIntervals++;
} else {
failedIntervals++;
@ -122,8 +112,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
public Integer reduce(Integer value, Integer sum) { return 0; }
public void onTraversalDone(Integer result) {
writer.flush();
writer.close();
System.out.println("Converted " + successfulIntervals + " records; failed to convert " + failedIntervals + " records.");
writer.close();
}
}

View File

@ -49,7 +49,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
WalkerTestSpec spec = new WalkerTestSpec(
"-T LiftoverVariants -o %s -R " + hg18Reference + " -B:variant,vcf " + validationDataLocation + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict",
1,
Arrays.asList("ce71a6d40f7375826e31adb3832d2fe3"));
Arrays.asList("22d4cd78d72da55f2423b9ab06f467b2"));
executeTest("test hg18 to hg19, unsorted", spec);
}
}

View File

@ -11,7 +11,6 @@ my $chain = undef;
my $newRef = undef;
my $oldRef = undef;
my $out = undef;
my $cacheSize = 50000;
my $tmp = "/tmp";
GetOptions( "vcf=s" => \$in,
"gatk=s" => \$gatk,
@ -19,12 +18,11 @@ GetOptions( "vcf=s" => \$in,
"newRef=s" => \$newRef,
"oldRef=s" => \$oldRef,
"out=s" => \$out,
"cache=s" => \$cacheSize,
"tmp=s" => \$tmp);
if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-cache \t\t<cache size used for on-the-fly sorting; defaults to 50000>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n";
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n\t-cache 60000\n";
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n";
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n";
exit(1);
}
@ -32,19 +30,42 @@ if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
my $random_number = rand();
my $tmp_prefix = "$tmp/$random_number";
print "Writing temporary files to prefix: $tmp_prefix\n";
my $lifted_vcf = "$tmp_prefix.lifted.vcf";
my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
# lift over the file
print "Lifting over the vcf...";
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $lifted_vcf -chain $chain -dict $newRef.dict -cache $cacheSize";
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
system($cmd);
# we need to sort the lifted over file now
print "\nRe-sorting the vcf...\n";
my $sorted_vcf = "$tmp_prefix.sorted.vcf";
open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!";
# write the header
open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!";
my $inHeader = 1;
while ( $inHeader == 1 ) {
my $line = <UNSORTED>;
if ( $line !~ m/^#/ ) {
$inHeader = 0;
} else {
print SORTED "$line";
}
}
close(UNSORTED);
$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai";
print SORTED `$cmd`;
close(SORTED);
# Filter the VCF for bad records
print "\nFixing/removing bad records...\n";
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $lifted_vcf -o $out";
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out";
system($cmd);
# clean up
unlink $lifted_vcf;
unlink $unsorted_vcf;
unlink $sorted_vcf;
print "\nDone!\n";