From 38a67fed6366d2d75e3bab3e2367d78aee8555bf Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 8 Oct 2010 19:53:21 +0000 Subject: [PATCH] High performance version of standard vcf writer. New general static Tribble class for common constants, including general .idx constant and functions to get standard index name for a given file. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4471 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/io/storage/StorageFactory.java | 2 + .../gatk/io/storage/VCFWriterStorage.java | 86 +++++++++++-------- .../tracks/builders/RMDTrackBuilder.java | 11 +-- .../org/broadinstitute/sting/WalkerTest.java | 19 ++++ .../builders/RMDTrackBuilderUnitTest.java | 19 ++-- .../utils/genotype/vcf/VCFWriterUnitTest.java | 3 +- 6 files changed, 86 insertions(+), 54 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java b/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java index ee5c56524..6e2aefca5 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/StorageFactory.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.io.storage; +import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamStub; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub; diff --git a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index ac8740289..d2ef9d6a9 100644 --- a/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -1,10 +1,9 @@ package org.broadinstitute.sting.gatk.io.storage; -import org.broad.tribble.vcf.StandardVCFWriter; -import org.broad.tribble.vcf.VCFHeader; -import org.broad.tribble.vcf.VCFHeaderLine; +import org.broad.tribble.readers.LineReader; +import org.broad.tribble.source.BasicFeatureSource; +import org.broad.tribble.vcf.*; import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFWriter; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; @@ -12,6 +11,7 @@ import java.io.*; import net.sf.samtools.util.BlockCompressedOutputStream; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; /** * Provides temporary and permanent storage for genotypes in VCF format. @@ -21,7 +21,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; */ public class VCFWriterStorage implements Storage, VCFWriter { protected final File file; - protected final OutputStream stream; + protected OutputStream stream; protected final VCFWriter writer; /** @@ -30,29 +30,40 @@ public class VCFWriterStorage implements Storage, VCFWriter { * @param stub Stub to use when constructing the output file. */ public VCFWriterStorage( VCFWriterStub stub ) { - if ( stub.getFile() != null ) { - file = stub.getFile(); - try { - if ( stub.isCompressed() ) - stream = new BlockCompressedOutputStream(file); - else - stream = new PrintStream(file); - } - catch(IOException ex) { - throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex); - } + this.file = stub.getFile(); + writer = VCFWriterToFile(stub, stub.getFile()); } else if ( stub.getOutputStream() != null ) { this.file = null; this.stream = stub.getOutputStream(); + writer = new StandardVCFWriter(stream); } else throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); - - writer = new StandardVCFWriter(stream); } + /** + * common initialization routine for multiple constructors + * @param stub + * @param file + * @return A VCF writer for use with this class + */ + private StandardVCFWriter VCFWriterToFile(VCFWriterStub stub, File file) { + try { + if ( stub.isCompressed() ) + stream = new BlockCompressedOutputStream(file); + else + stream = new PrintStream(file); + } + catch(IOException ex) { + throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex); + } + + return new StandardVCFWriter(file, this.stream); + } + + /** * Constructs an object which will redirect into a different file. * @param stub Stub to use when synthesizing file / header info. @@ -60,13 +71,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { */ public VCFWriterStorage(VCFWriterStub stub, File file) { this.file = file; - try { - this.stream = new PrintStream(file); - } - catch(IOException ex) { - throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream",ex); - } - writer = new StandardVCFWriter(this.stream); + this.writer = VCFWriterToFile(stub, file); writer.writeHeader(stub.getVCFHeader()); } @@ -94,20 +99,33 @@ public class VCFWriterStorage implements Storage, VCFWriter { * Merges the stream backing up this temporary storage into the target. * @param target Target stream for the temporary storage. May not be null. */ +// public void mergeInto(VCFWriterStorage target) { +// PrintStream formattingTarget = new PrintStream(target.stream); +// try { +// BufferedReader reader = new BufferedReader(new FileReader(file)); +// String line = reader.readLine(); +// while ( line != null ) { +// if (!VCFHeaderLine.isHeaderLine(line)) +// formattingTarget.printf("%s%n",line); +// line = reader.readLine(); +// } +// +// reader.close(); +// } catch (IOException e) { +// throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e); +// } +// } public void mergeInto(VCFWriterStorage target) { - PrintStream formattingTarget = new PrintStream(target.stream); try { - BufferedReader reader = new BufferedReader(new FileReader(file)); - String line = reader.readLine(); - while ( line != null ) { - if (!VCFHeaderLine.isHeaderLine(line)) - formattingTarget.printf("%s%n",line); - line = reader.readLine(); + BasicFeatureSource source = BasicFeatureSource.getFeatureSource(file.getAbsolutePath(), new VCFCodec()); + + for ( VariantContext vc : source.iterator() ) { + target.writer.add(vc, vc.getReferenceBaseForIndel()); } - reader.close(); + source.close(); } catch (IOException e) { throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e); } - } + } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java index c4e139ca8..bc8f025d6 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilder.java @@ -69,9 +69,6 @@ public class RMDTrackBuilder extends PluginManager { // the input strings we use to create RODs from private final List inputs = new ArrayList(); - // the linear index extension - public static final String indexExtension = ".idx"; - private Map classes = null; // private sequence dictionary we use to set our tracks with @@ -201,7 +198,7 @@ public class RMDTrackBuilder extends PluginManager { // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match if (dictFromIndex.size() == 0 && dict != null) { - File indexFile = indexFileForFile(inputFile); + File indexFile = Tribble.indexFile(inputFile); setIndexSequenceDictionary(index,dict,indexFile,true); dictFromIndex = getSequenceDictionaryFromProperties(index); } @@ -218,10 +215,6 @@ public class RMDTrackBuilder extends PluginManager { return reader; } - public static File indexFileForFile(File inputFile) { - return new File(inputFile.getAbsoluteFile() + indexExtension); - } - /** * create an index for the input file * @param inputFile the input file @@ -231,7 +224,7 @@ public class RMDTrackBuilder extends PluginManager { */ public synchronized static Index loadIndex(File inputFile, FeatureCodec codec) throws IOException { // create the index file name, locking on the index file name - File indexFile = indexFileForFile(inputFile); + File indexFile = Tribble.indexFile(inputFile); FSLockWithShared lock = new FSLockWithShared(indexFile); // acquire a lock on the file diff --git a/java/test/org/broadinstitute/sting/WalkerTest.java b/java/test/org/broadinstitute/sting/WalkerTest.java index 6f8866f97..68956db0d 100755 --- a/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/java/test/org/broadinstitute/sting/WalkerTest.java @@ -26,12 +26,16 @@ package org.broadinstitute.sting; import junit.framework.Assert; +import org.broad.tribble.Tribble; +import org.broad.tribble.index.IndexFactory; +import org.broad.tribble.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.utils.GenomeLocParserTestUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.StingException; import org.junit.Test; import org.apache.commons.io.FileUtils; @@ -139,10 +143,25 @@ public class WalkerTest extends BaseTest { } } + public void maybeValidateSupplementaryFile(final String name, final File resultFile) { + File indexFile = Tribble.indexFile(resultFile); + //System.out.println("Putative index file is " + indexFile); + if ( indexFile.exists() ) { + if ( resultFile.getAbsolutePath().contains(".vcf") ) { + // todo -- currently we only understand VCF files! Blow up since we can't test them + throw new StingException("Found an index created for file " + resultFile + " but we can only validate VCF files. Extend this code!"); + } + + System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile); + Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec())); + } + } + public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s) { List md5s = new ArrayList(); for (int i = 0; i < resultFiles.size(); i++) { String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); + maybeValidateSupplementaryFile(name, resultFiles.get(i)); md5s.add(i, md5); } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java index 86de4979f..f44a3f9af 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/RMDTrackBuilderUnitTest.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.refdata.tracks.builders; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; import org.broad.tribble.vcf.VCFCodec; import org.broadinstitute.sting.BaseTest; @@ -77,7 +78,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { } // make sure we didn't write the file (check that it's timestamp is within bounds) //System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()); - Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()) < 100); + Assert.assertTrue(Math.abs(1279591752000l - Tribble.indexFile(vcfFile).lastModified()) < 100); } @@ -86,7 +87,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @Test public void testDirIsLockedIndexFromDisk() { File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf"); - File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf.idx"); + File vcfFileIndex = Tribble.indexFile(vcfFile); Index ind = null; try { ind = builder.attemptIndexFromDisk(vcfFile,new VCFCodec(),vcfFileIndex,new FSLockWithShared(vcfFile)); @@ -102,7 +103,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @Test public void testBuilderIndexDirectoryUnwritable() { File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf"); - File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf.idx"); + File vcfFileIndex = Tribble.indexFile(vcfFile); Index ind = null; try { @@ -121,7 +122,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @Test public void testGenerateIndexForUnindexedFile() { File vcfFile = new File(validationDataLocation + "/ROD_validation/always_reindex.vcf"); - File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/always_reindex.vcf.idx"); + File vcfFileIndex = Tribble.indexFile(vcfFile); // if we can't write to the directory, don't fault the tester, just pass if (!vcfFileIndex.getParentFile().canWrite()) { @@ -147,7 +148,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { @Test public void testBuilderIndexSequenceDictionary() { File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf")); - Long indexTimeStamp = new File(vcfFile.getAbsolutePath() + ".idx").lastModified(); + Long indexTimeStamp = Tribble.indexFile(vcfFile).lastModified(); try { Index idx = builder.loadIndex(vcfFile, new VCFCodec()); RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false); @@ -157,11 +158,9 @@ public class RMDTrackBuilderUnitTest extends BaseTest { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); } - //System.err.println("index : " + new File(vcfFile + ".idx").lastModified()); - //System.err.println("old : " + indexTimeStamp); // make sure that we removed and updated the index - Assert.assertTrue("Fail: index file was modified", new File(vcfFile + ".idx").lastModified() == indexTimeStamp); + Assert.assertTrue("Fail: index file was modified", Tribble.indexFile(vcfFile).lastModified() == indexTimeStamp); } /** @@ -184,11 +183,11 @@ public class RMDTrackBuilderUnitTest extends BaseTest { Thread.sleep(2000); // create a fake index, before we copy so it's out of date - File tmpIndex = new File(tmpFile.getAbsolutePath() + ".idx"); + File tmpIndex = Tribble.indexFile(tmpFile); tmpIndex.deleteOnExit(); // copy the vcf (tribble) file to the tmp file location - copyFile(new File(tribbleFile + ".idx"), tmpIndex); + copyFile(Tribble.indexFile(tribbleFile), tmpIndex); return tmpFile; diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index e68a7cbb9..88ed22bdb 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.genotype.vcf; +import org.broad.tribble.Tribble; import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; @@ -76,7 +77,7 @@ public class VCFWriterUnitTest extends BaseTest { counter++; } Assert.assertEquals(2,counter); - new File(fakeVCFFile + RMDTrackBuilder.indexExtension).delete(); + Tribble.indexFile(fakeVCFFile).delete(); fakeVCFFile.delete(); } catch (IOException e ) {