From b6bdd6128310a6c0d12a466e6b3e0a496bf6be3e Mon Sep 17 00:00:00 2001 From: delangel Date: Fri, 2 Jul 2010 15:19:42 +0000 Subject: [PATCH] a) Fix bug when multi-base reference is homopolymeric when writing a VCF4.0 variant context: computation of number of trailing bases was incorrect and we ended up with incorrect position. b) Updated VCF4WriterTestWalker to take either VCF3 or VCF4 as inputs (this walker can also be used to convert from 3.3 to 4.0). git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3711 348d0f76-0448-11de-a6fe-93d51630548a --- .../walkers/VCF4WriterTestWalker.java | 45 ++++++++++++------- .../sting/utils/genotype/vcf/VCFWriter.java | 2 +- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java index b2eef66d4..4895cde37 100755 --- a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/VCF4WriterTestWalker.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.oneoffprojects.walkers; +import org.broad.tribble.FeatureCodec; import org.broad.tribble.util.AsciiLineReader; import org.broad.tribble.vcf.*; import org.broadinstitute.sting.commandline.Argument; @@ -74,6 +75,7 @@ public class VCF4WriterTestWalker extends RodWalker { final TreeSet samples = new TreeSet(); VCF4Codec vcf4codec = new VCF4Codec(); + /** * Initialize the number of loci processed to zero. * @@ -84,30 +86,43 @@ public class VCF4WriterTestWalker extends RodWalker { public void initialize() { final List dataSources = this.getToolkit().getRodDataSources(); - // Open output file specified by output VCF ROD - - vcfWriter = new VCFWriter(new File(OUTPUT_FILE), true); + // Open output file specified by output VCF ROD + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit())); + + + vcfWriter = new VCFWriter(new File(OUTPUT_FILE), true); + VCFHeader header = null; for( final ReferenceOrderedDataSource source : dataSources ) { final RMDTrack rod = source.getReferenceOrderedData(); - Object p = rod.getRecordType(); - if(rod.getType().equals(vcf4codec.getClass()) && rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) { - try { - AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath())); - int lineNumber = vcf4codec.readHeader(lineReader); - out.printf("Read %d header lines%n", lineNumber); + if(rod.getName().equalsIgnoreCase(INPUT_ROD_NAME)) { + if (rod.getType().equals(vcf4codec.getClass())) { - VCFHeader header = vcf4codec.getHeader(VCFHeader.class); - vcfWriter.writeHeader(header); - } - catch (FileNotFoundException e ) { - throw new StingException(e.getMessage()); + try { + AsciiLineReader lineReader = new AsciiLineReader(new FileInputStream(rod.getFile().getAbsolutePath())); + int lineNumber = vcf4codec.readHeader(lineReader); + out.printf("Read %d header lines%n", lineNumber); + + header = vcf4codec.getHeader(VCFHeader.class); + } + catch (FileNotFoundException e ) { + throw new StingException(e.getMessage()); + } + } else { + final VCFReader reader = new VCFReader(rod.getFile()); + final Set vcfSamples = reader.getHeader().getGenotypeSamples(); + samples.addAll(vcfSamples); + reader.close(); + header = new VCFHeader(hInfo, samples); } + } - } + header.setVersion(VCFHeaderVersion.VCF4_0); + vcfWriter.writeHeader(header); } diff --git a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java index 9e824cce6..f96e49752 100644 --- a/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java +++ b/java/src/org/broadinstitute/sting/utils/genotype/vcf/VCFWriter.java @@ -279,7 +279,7 @@ public class VCFWriter { // scan through all alleles and look for maximum common subsequence at beginning and at end for all alleles numTrailingBases = 1; } else - numTrailingBases = originalRef.indexOf(refString); + numTrailingBases = originalRef.lastIndexOf(refString); numPaddingBases = originalRef.length()-refString.length()-numTrailingBases; if (numTrailingBases > 0) {