Hmm. Apparently variants can get lifted over to different chromosomes. Who knew? Reverting changes from a couple of days ago. The only way to do this correctly (without requiring lots of memory) is to turn off on-the-fly indexing for this walker. Integration tests cover this now.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4510 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
196029c0b4
commit
7aa030a9a4
|
|
@ -30,15 +30,13 @@ import org.broadinstitute.sting.commandline.Output;
|
|||
import org.broadinstitute.sting.utils.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
|
||||
import org.broad.tribble.vcf.VCFHeader;
|
||||
import org.broad.tribble.vcf.SortingVCFWriter;
|
||||
import org.broad.tribble.TribbleException;
|
||||
import org.broad.tribble.vcf.StandardVCFWriter;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -56,7 +54,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
@Output(doc="File to which variants should be written",required=true)
|
||||
protected File file = null;
|
||||
protected SortingVCFWriter writer = null;
|
||||
protected StandardVCFWriter writer = null;
|
||||
|
||||
@Argument(fullName="chain", shortName="chain", doc="Chain file", required=true)
|
||||
protected File CHAIN = null;
|
||||
|
|
@ -64,9 +62,6 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
@Argument(fullName="newSequenceDictionary", shortName="dict", doc="Sequence .dict file for the new build", required=true)
|
||||
protected File NEW_SEQ_DICT = null;
|
||||
|
||||
@Argument(fullName="maxCachingDistance", shortName="cache", doc="Maximum genomic distance for which we will cache variants", required=false)
|
||||
protected Integer maxCachingDistance = 50000;
|
||||
|
||||
private LiftOver liftOver;
|
||||
|
||||
private long successfulIntervals = 0, failedIntervals = 0;
|
||||
|
|
@ -82,8 +77,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
Map<String, VCFHeader> vcfHeaders = VCFUtils.getVCFHeadersFromRods(getToolkit(), Arrays.asList("variant"));
|
||||
|
||||
final VCFHeader vcfHeader = new VCFHeader(vcfHeaders.containsKey("variant") ? vcfHeaders.get("variant").getMetaData() : null, samples);
|
||||
|
||||
writer = new SortingVCFWriter(file, maxCachingDistance);
|
||||
writer = new StandardVCFWriter(file, false);
|
||||
writer.writeHeader(vcfHeader);
|
||||
}
|
||||
|
||||
|
|
@ -95,11 +89,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
if ( toInterval != null ) {
|
||||
vc = VariantContextUtils.modifyLocation(vc, GenomeLocParser.createPotentiallyInvalidGenomeLoc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length));
|
||||
try {
|
||||
writer.add(vc, ref.getBase());
|
||||
} catch (TribbleException e) {
|
||||
throw new UserException(e.getMessage());
|
||||
}
|
||||
writer.add(vc, ref.getBase());
|
||||
successfulIntervals++;
|
||||
} else {
|
||||
failedIntervals++;
|
||||
|
|
@ -122,8 +112,7 @@ public class LiftoverVariants extends RodWalker<Integer, Integer> {
|
|||
public Integer reduce(Integer value, Integer sum) { return 0; }
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
writer.flush();
|
||||
writer.close();
|
||||
System.out.println("Converted " + successfulIntervals + " records; failed to convert " + failedIntervals + " records.");
|
||||
writer.close();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec = new WalkerTestSpec(
|
||||
"-T LiftoverVariants -o %s -R " + hg18Reference + " -B:variant,vcf " + validationDataLocation + "liftover_test.vcf -chain " + validationDataLocation + "hg18ToHg19.broad.over.chain -dict /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19.dict",
|
||||
1,
|
||||
Arrays.asList("ce71a6d40f7375826e31adb3832d2fe3"));
|
||||
Arrays.asList("22d4cd78d72da55f2423b9ab06f467b2"));
|
||||
executeTest("test hg18 to hg19, unsorted", spec);
|
||||
}
|
||||
}
|
||||
|
|
@ -11,7 +11,6 @@ my $chain = undef;
|
|||
my $newRef = undef;
|
||||
my $oldRef = undef;
|
||||
my $out = undef;
|
||||
my $cacheSize = 50000;
|
||||
my $tmp = "/tmp";
|
||||
GetOptions( "vcf=s" => \$in,
|
||||
"gatk=s" => \$gatk,
|
||||
|
|
@ -19,12 +18,11 @@ GetOptions( "vcf=s" => \$in,
|
|||
"newRef=s" => \$newRef,
|
||||
"oldRef=s" => \$oldRef,
|
||||
"out=s" => \$out,
|
||||
"cache=s" => \$cacheSize,
|
||||
"tmp=s" => \$tmp);
|
||||
|
||||
if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
|
||||
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-cache \t\t<cache size used for on-the-fly sorting; defaults to 50000>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n";
|
||||
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n\t-cache 60000\n";
|
||||
print "Usage: liftOverVCF.pl\n\t-vcf \t\t<input vcf>\n\t-gatk \t\t<path to gatk trunk>\n\t-chain \t\t<chain file>\n\t-newRef \t<path to new reference prefix; we will need newRef.dict, .fasta, and .fasta.fai>\n\t-oldRef \t<path to old reference prefix; we will need oldRef.fasta>\n\t-out \t\t<output vcf>\n\t-tmp \t\t<temp file location; defaults to /tmp>\n";
|
||||
print "Example: ./liftOverVCF.pl\n\t-vcf /humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/1kg_snp_validation/all_validation_batches.b36.vcf\n\t-chain b36ToHg19.broad.over.chain\n\t-out lifted.hg19.vcf\n\t-gatk /humgen/gsa-scr1/ebanks/Sting_dev\n\t-newRef /seq/references/Homo_sapiens_assembly19/v0/Homo_sapiens_assembly19\n\t-oldRef /humgen/1kg/reference/human_b36_both\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
|
@ -32,19 +30,42 @@ if ( !$in || !$gatk || !$chain || !$newRef || !$oldRef || !$out ) {
|
|||
my $random_number = rand();
|
||||
my $tmp_prefix = "$tmp/$random_number";
|
||||
print "Writing temporary files to prefix: $tmp_prefix\n";
|
||||
my $lifted_vcf = "$tmp_prefix.lifted.vcf";
|
||||
my $unsorted_vcf = "$tmp_prefix.unsorted.vcf";
|
||||
|
||||
# lift over the file
|
||||
print "Lifting over the vcf...";
|
||||
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $lifted_vcf -chain $chain -dict $newRef.dict -cache $cacheSize";
|
||||
my $cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T LiftoverVariants -R $oldRef.fasta -B:variant,vcf $in -o $unsorted_vcf -chain $chain -dict $newRef.dict";
|
||||
system($cmd);
|
||||
|
||||
# we need to sort the lifted over file now
|
||||
print "\nRe-sorting the vcf...\n";
|
||||
my $sorted_vcf = "$tmp_prefix.sorted.vcf";
|
||||
open(SORTED, ">$sorted_vcf") or die "can't open $sorted_vcf: $!";
|
||||
|
||||
# write the header
|
||||
open(UNSORTED, "< $unsorted_vcf") or die "can't open $unsorted_vcf: $!";
|
||||
my $inHeader = 1;
|
||||
while ( $inHeader == 1 ) {
|
||||
my $line = <UNSORTED>;
|
||||
if ( $line !~ m/^#/ ) {
|
||||
$inHeader = 0;
|
||||
} else {
|
||||
print SORTED "$line";
|
||||
}
|
||||
}
|
||||
close(UNSORTED);
|
||||
|
||||
$cmd = "grep \"^#\" -v $unsorted_vcf | sort -n -k2 -T $tmp | $gatk/perl/sortByRef.pl --tmp $tmp - $newRef.fasta.fai";
|
||||
print SORTED `$cmd`;
|
||||
close(SORTED);
|
||||
|
||||
# Filter the VCF for bad records
|
||||
print "\nFixing/removing bad records...\n";
|
||||
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $lifted_vcf -o $out";
|
||||
$cmd = "java -jar $gatk/dist/GenomeAnalysisTK.jar -T FilterLiftedVariants -R $newRef.fasta -B:variant,vcf $sorted_vcf -o $out";
|
||||
system($cmd);
|
||||
|
||||
# clean up
|
||||
unlink $lifted_vcf;
|
||||
unlink $unsorted_vcf;
|
||||
unlink $sorted_vcf;
|
||||
|
||||
print "\nDone!\n";
|
||||
|
|
|
|||
Loading…
Reference in New Issue