High-performance version of the standard VCF writer. New general-purpose static Tribble class for common constants, including the shared .idx extension constant and functions that return the standard index file name for a given file.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4471 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-10-08 19:53:21 +00:00
parent 6368a46bab
commit 38a67fed63
6 changed files with 86 additions and 54 deletions

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.gatk.io.storage;
import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.io.stubs.OutputStreamStub;
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub;

View File

@ -1,10 +1,9 @@
package org.broadinstitute.sting.gatk.io.storage;
import org.broad.tribble.vcf.StandardVCFWriter;
import org.broad.tribble.vcf.VCFHeader;
import org.broad.tribble.vcf.VCFHeaderLine;
import org.broad.tribble.readers.LineReader;
import org.broad.tribble.source.BasicFeatureSource;
import org.broad.tribble.vcf.*;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
@ -12,6 +11,7 @@ import java.io.*;
import net.sf.samtools.util.BlockCompressedOutputStream;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
/**
* Provides temporary and permanent storage for genotypes in VCF format.
@ -21,7 +21,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
*/
public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
protected final File file;
protected final OutputStream stream;
protected OutputStream stream;
protected final VCFWriter writer;
/**
@ -30,29 +30,40 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
* @param stub Stub to use when constructing the output file.
*/
public VCFWriterStorage( VCFWriterStub stub ) {
if ( stub.getFile() != null ) {
file = stub.getFile();
try {
if ( stub.isCompressed() )
stream = new BlockCompressedOutputStream(file);
else
stream = new PrintStream(file);
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
}
this.file = stub.getFile();
writer = VCFWriterToFile(stub, stub.getFile());
}
else if ( stub.getOutputStream() != null ) {
this.file = null;
this.stream = stub.getOutputStream();
writer = new StandardVCFWriter(stream);
}
else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
writer = new StandardVCFWriter(stream);
}
/**
 * Opens an output stream on the given file and wraps it in a VCF writer.
 * Shared initialization routine used by the constructors of this class.
 *
 * As a side effect the newly opened stream is stored in this object's
 * {@code stream} field: a BlockCompressedOutputStream when the stub reports
 * compressed output, a PrintStream otherwise.
 *
 * @param stub stub consulted to decide whether the output is compressed
 * @param file file to open for writing
 * @return A VCF writer for use with this class
 */
private StandardVCFWriter VCFWriterToFile(VCFWriterStub stub, File file) {
    try {
        this.stream = stub.isCompressed()
                ? new BlockCompressedOutputStream(file)
                : new PrintStream(file);
    }
    catch ( IOException openFailure ) {
        // report an unopenable target as a user-facing error, keeping the original cause
        throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", openFailure);
    }

    final StandardVCFWriter fileWriter = new StandardVCFWriter(file, this.stream);
    return fileWriter;
}
/**
* Constructs an object which will redirect into a different file.
* @param stub Stub to use when synthesizing file / header info.
@ -60,13 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
*/
public VCFWriterStorage(VCFWriterStub stub, File file) {
this.file = file;
try {
this.stream = new PrintStream(file);
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream",ex);
}
writer = new StandardVCFWriter(this.stream);
this.writer = VCFWriterToFile(stub, file);
writer.writeHeader(stub.getVCFHeader());
}
@ -94,20 +99,33 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
* Merges the stream backing up this temporary storage into the target.
* @param target Target stream for the temporary storage. May not be null.
*/
// public void mergeInto(VCFWriterStorage target) {
// PrintStream formattingTarget = new PrintStream(target.stream);
// try {
// BufferedReader reader = new BufferedReader(new FileReader(file));
// String line = reader.readLine();
// while ( line != null ) {
// if (!VCFHeaderLine.isHeaderLine(line))
// formattingTarget.printf("%s%n",line);
// line = reader.readLine();
// }
//
// reader.close();
// } catch (IOException e) {
// throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
// }
// }
public void mergeInto(VCFWriterStorage target) {
PrintStream formattingTarget = new PrintStream(target.stream);
try {
BufferedReader reader = new BufferedReader(new FileReader(file));
String line = reader.readLine();
while ( line != null ) {
if (!VCFHeaderLine.isHeaderLine(line))
formattingTarget.printf("%s%n",line);
line = reader.readLine();
BasicFeatureSource<VariantContext> source = BasicFeatureSource.getFeatureSource(file.getAbsolutePath(), new VCFCodec());
for ( VariantContext vc : source.iterator() ) {
target.writer.add(vc, vc.getReferenceBaseForIndel());
}
reader.close();
source.close();
} catch (IOException e) {
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
}
}
}
}

View File

@ -69,9 +69,6 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
// the input strings we use to create RODs from
private final List<RMDTriplet> inputs = new ArrayList<RMDTriplet>();
// the linear index extension
public static final String indexExtension = ".idx";
private Map<String, Class> classes = null;
// private sequence dictionary we use to set our tracks with
@ -201,7 +198,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (dictFromIndex.size() == 0 && dict != null) {
File indexFile = indexFileForFile(inputFile);
File indexFile = Tribble.indexFile(inputFile);
setIndexSequenceDictionary(index,dict,indexFile,true);
dictFromIndex = getSequenceDictionaryFromProperties(index);
}
@ -218,10 +215,6 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
return reader;
}
/**
 * Builds the index file handle that corresponds to an input file.
 *
 * @param inputFile the file whose index location is wanted
 * @return a File whose path is the absolute form of inputFile followed by indexExtension
 */
public static File indexFileForFile(File inputFile) {
    final File absoluteInput = inputFile.getAbsoluteFile();
    final String indexPath = absoluteInput + indexExtension;
    return new File(indexPath);
}
/**
* create an index for the input file
* @param inputFile the input file
@ -231,7 +224,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
*/
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec) throws IOException {
// create the index file name, locking on the index file name
File indexFile = indexFileForFile(inputFile);
File indexFile = Tribble.indexFile(inputFile);
FSLockWithShared lock = new FSLockWithShared(indexFile);
// acquire a lock on the file

View File

@ -26,12 +26,16 @@
package org.broadinstitute.sting;
import junit.framework.Assert;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.utils.GenomeLocParserTestUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.junit.Test;
import org.apache.commons.io.FileUtils;
@ -139,10 +143,25 @@ public class WalkerTest extends BaseTest {
}
}
/**
 * Validates the on-the-fly index that accompanies a result file, if there is one.
 *
 * The index location is obtained from Tribble.indexFile(resultFile). When no file
 * exists at that location this method does nothing. When one exists, the result file
 * must look like a VCF (its absolute path contains ".vcf"); the on-disk index is then
 * asserted to be equal to a freshly created index for the same file.
 *
 * @param name       name of the test, used only in the progress message
 * @param resultFile output file whose index should be verified
 * @throws StingException if an index exists for a result file that is not a VCF
 */
public void maybeValidateSupplementaryFile(final String name, final File resultFile) {
    final File indexFile = Tribble.indexFile(resultFile);
    if ( ! indexFile.exists() )
        return; // no index was written alongside this result; nothing to validate

    // todo -- currently we only understand VCF files! Blow up for anything else since we can't test it.
    // The guard must reject NON-VCF files; rejecting VCFs would make every indexed VCF result fail.
    if ( ! resultFile.getAbsolutePath().contains(".vcf") ) {
        throw new StingException("Found an index created for file " + resultFile + " but we can only validate VCF files.  Extend this code!");
    }

    System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile);
    Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec()));
}
public List<String> assertMatchingMD5s(final String name, List<File> resultFiles, List<String> expectedMD5s) {
List<String> md5s = new ArrayList<String>();
for (int i = 0; i < resultFiles.size(); i++) {
String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i));
maybeValidateSupplementaryFile(name, resultFiles.get(i));
md5s.add(i, md5);
}

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.refdata.tracks.builders;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index;
import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.BaseTest;
@ -77,7 +78,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
}
// make sure we didn't write the file (check that its timestamp is within bounds)
//System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified());
Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()) < 100);
Assert.assertTrue(Math.abs(1279591752000l - Tribble.indexFile(vcfFile).lastModified()) < 100);
}
@ -86,7 +87,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test
public void testDirIsLockedIndexFromDisk() {
File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf");
File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf.idx");
File vcfFileIndex = Tribble.indexFile(vcfFile);
Index ind = null;
try {
ind = builder.attemptIndexFromDisk(vcfFile,new VCFCodec(),vcfFileIndex,new FSLockWithShared(vcfFile));
@ -102,7 +103,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test
public void testBuilderIndexDirectoryUnwritable() {
File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf");
File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf.idx");
File vcfFileIndex = Tribble.indexFile(vcfFile);
Index ind = null;
try {
@ -121,7 +122,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test
public void testGenerateIndexForUnindexedFile() {
File vcfFile = new File(validationDataLocation + "/ROD_validation/always_reindex.vcf");
File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/always_reindex.vcf.idx");
File vcfFileIndex = Tribble.indexFile(vcfFile);
// if we can't write to the directory, don't fault the tester, just pass
if (!vcfFileIndex.getParentFile().canWrite()) {
@ -147,7 +148,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test
public void testBuilderIndexSequenceDictionary() {
File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf"));
Long indexTimeStamp = new File(vcfFile.getAbsolutePath() + ".idx").lastModified();
Long indexTimeStamp = Tribble.indexFile(vcfFile).lastModified();
try {
Index idx = builder.loadIndex(vcfFile, new VCFCodec());
RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false);
@ -157,11 +158,9 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage());
}
//System.err.println("index : " + new File(vcfFile + ".idx").lastModified());
//System.err.println("old : " + indexTimeStamp);
// make sure that the index file on disk was not modified (its timestamp must be unchanged)
Assert.assertTrue("Fail: index file was modified", new File(vcfFile + ".idx").lastModified() == indexTimeStamp);
Assert.assertTrue("Fail: index file was modified", Tribble.indexFile(vcfFile).lastModified() == indexTimeStamp);
}
/**
@ -184,11 +183,11 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
Thread.sleep(2000);
// create a fake index, before we copy so it's out of date
File tmpIndex = new File(tmpFile.getAbsolutePath() + ".idx");
File tmpIndex = Tribble.indexFile(tmpFile);
tmpIndex.deleteOnExit();
// copy the vcf (tribble) file to the tmp file location
copyFile(new File(tribbleFile + ".idx"), tmpIndex);
copyFile(Tribble.indexFile(tribbleFile), tmpIndex);
return tmpFile;

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.utils.genotype.vcf;
import org.broad.tribble.Tribble;
import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
@ -76,7 +77,7 @@ public class VCFWriterUnitTest extends BaseTest {
counter++;
}
Assert.assertEquals(2,counter);
new File(fakeVCFFile + RMDTrackBuilder.indexExtension).delete();
Tribble.indexFile(fakeVCFFile).delete();
fakeVCFFile.delete();
}
catch (IOException e ) {