High-performance version of the standard VCF writer. New general static Tribble class for common constants, including a general .idx constant and functions to get the standard index file name for a given file.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4471 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2010-10-08 19:53:21 +00:00
parent 6368a46bab
commit 38a67fed63
6 changed files with 86 additions and 54 deletions

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.gatk.io.storage; package org.broadinstitute.sting.gatk.io.storage;
import org.broadinstitute.sting.gatk.AbstractGenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.io.stubs.OutputStreamStub; import org.broadinstitute.sting.gatk.io.stubs.OutputStreamStub;
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub; import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub;

View File

@ -1,10 +1,9 @@
package org.broadinstitute.sting.gatk.io.storage; package org.broadinstitute.sting.gatk.io.storage;
import org.broad.tribble.vcf.StandardVCFWriter; import org.broad.tribble.readers.LineReader;
import org.broad.tribble.vcf.VCFHeader; import org.broad.tribble.source.BasicFeatureSource;
import org.broad.tribble.vcf.VCFHeaderLine; import org.broad.tribble.vcf.*;
import org.broad.tribble.util.variantcontext.VariantContext; import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFWriter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
@ -12,6 +11,7 @@ import java.io.*;
import net.sf.samtools.util.BlockCompressedOutputStream; import net.sf.samtools.util.BlockCompressedOutputStream;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
/** /**
* Provides temporary and permanent storage for genotypes in VCF format. * Provides temporary and permanent storage for genotypes in VCF format.
@ -21,7 +21,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
*/ */
public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter { public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
protected final File file; protected final File file;
protected final OutputStream stream; protected OutputStream stream;
protected final VCFWriter writer; protected final VCFWriter writer;
/** /**
@ -30,29 +30,40 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
* @param stub Stub to use when constructing the output file. * @param stub Stub to use when constructing the output file.
*/ */
public VCFWriterStorage( VCFWriterStub stub ) { public VCFWriterStorage( VCFWriterStub stub ) {
if ( stub.getFile() != null ) { if ( stub.getFile() != null ) {
file = stub.getFile(); this.file = stub.getFile();
try { writer = VCFWriterToFile(stub, stub.getFile());
if ( stub.isCompressed() )
stream = new BlockCompressedOutputStream(file);
else
stream = new PrintStream(file);
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
}
} }
else if ( stub.getOutputStream() != null ) { else if ( stub.getOutputStream() != null ) {
this.file = null; this.file = null;
this.stream = stub.getOutputStream(); this.stream = stub.getOutputStream();
writer = new StandardVCFWriter(stream);
} }
else else
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
writer = new StandardVCFWriter(stream);
} }
/**
 * Shared initialization used by both constructors when output goes to a file:
 * opens the output stream on the given file, stores it in {@code this.stream},
 * and wraps it in a VCF writer.
 *
 * @param stub stub consulted (via {@code isCompressed()}) to decide whether the
 *             file is opened as a block-compressed stream or a plain print stream
 * @param file file to open for writing
 * @return A VCF writer for use with this class, constructed from the file and the newly opened stream
 * @throws UserException.CouldNotCreateOutputFile if opening the stream raises an IOException
 */
private StandardVCFWriter VCFWriterToFile(VCFWriterStub stub, File file) {
    try {
        // Pick the stream flavour in a single expression; both constructors are covered by the catch below.
        this.stream = stub.isCompressed()
                ? new BlockCompressedOutputStream(file)
                : new PrintStream(file);
    }
    catch(IOException ex) {
        throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
    }

    return new StandardVCFWriter(file, this.stream);
}
/** /**
* Constructs an object which will redirect into a different file. * Constructs an object which will redirect into a different file.
* @param stub Stub to use when synthesizing file / header info. * @param stub Stub to use when synthesizing file / header info.
@ -60,13 +71,7 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
*/ */
public VCFWriterStorage(VCFWriterStub stub, File file) { public VCFWriterStorage(VCFWriterStub stub, File file) {
this.file = file; this.file = file;
try { this.writer = VCFWriterToFile(stub, file);
this.stream = new PrintStream(file);
}
catch(IOException ex) {
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream",ex);
}
writer = new StandardVCFWriter(this.stream);
writer.writeHeader(stub.getVCFHeader()); writer.writeHeader(stub.getVCFHeader());
} }
@ -94,18 +99,31 @@ public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
* Merges the stream backing up this temporary storage into the target. * Merges the stream backing up this temporary storage into the target.
* @param target Target stream for the temporary storage. May not be null. * @param target Target stream for the temporary storage. May not be null.
*/ */
// public void mergeInto(VCFWriterStorage target) {
// PrintStream formattingTarget = new PrintStream(target.stream);
// try {
// BufferedReader reader = new BufferedReader(new FileReader(file));
// String line = reader.readLine();
// while ( line != null ) {
// if (!VCFHeaderLine.isHeaderLine(line))
// formattingTarget.printf("%s%n",line);
// line = reader.readLine();
// }
//
// reader.close();
// } catch (IOException e) {
// throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
// }
// }
public void mergeInto(VCFWriterStorage target) { public void mergeInto(VCFWriterStorage target) {
PrintStream formattingTarget = new PrintStream(target.stream);
try { try {
BufferedReader reader = new BufferedReader(new FileReader(file)); BasicFeatureSource<VariantContext> source = BasicFeatureSource.getFeatureSource(file.getAbsolutePath(), new VCFCodec());
String line = reader.readLine();
while ( line != null ) { for ( VariantContext vc : source.iterator() ) {
if (!VCFHeaderLine.isHeaderLine(line)) target.writer.add(vc, vc.getReferenceBaseForIndel());
formattingTarget.printf("%s%n",line);
line = reader.readLine();
} }
reader.close(); source.close();
} catch (IOException e) { } catch (IOException e) {
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e); throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
} }

View File

@ -69,9 +69,6 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
// the input strings we use to create RODs from // the input strings we use to create RODs from
private final List<RMDTriplet> inputs = new ArrayList<RMDTriplet>(); private final List<RMDTriplet> inputs = new ArrayList<RMDTriplet>();
// the linear index extension
public static final String indexExtension = ".idx";
private Map<String, Class> classes = null; private Map<String, Class> classes = null;
// private sequence dictionary we use to set our tracks with // private sequence dictionary we use to set our tracks with
@ -201,7 +198,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
// if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match
if (dictFromIndex.size() == 0 && dict != null) { if (dictFromIndex.size() == 0 && dict != null) {
File indexFile = indexFileForFile(inputFile); File indexFile = Tribble.indexFile(inputFile);
setIndexSequenceDictionary(index,dict,indexFile,true); setIndexSequenceDictionary(index,dict,indexFile,true);
dictFromIndex = getSequenceDictionaryFromProperties(index); dictFromIndex = getSequenceDictionaryFromProperties(index);
} }
@ -218,10 +215,6 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
return reader; return reader;
} }
public static File indexFileForFile(File inputFile) {
return new File(inputFile.getAbsoluteFile() + indexExtension);
}
/** /**
* create an index for the input file * create an index for the input file
* @param inputFile the input file * @param inputFile the input file
@ -231,7 +224,7 @@ public class RMDTrackBuilder extends PluginManager<FeatureCodec> {
*/ */
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec) throws IOException { public synchronized static Index loadIndex(File inputFile, FeatureCodec codec) throws IOException {
// create the index file name, locking on the index file name // create the index file name, locking on the index file name
File indexFile = indexFileForFile(inputFile); File indexFile = Tribble.indexFile(inputFile);
FSLockWithShared lock = new FSLockWithShared(indexFile); FSLockWithShared lock = new FSLockWithShared(indexFile);
// acquire a lock on the file // acquire a lock on the file

View File

@ -26,12 +26,16 @@
package org.broadinstitute.sting; package org.broadinstitute.sting;
import junit.framework.Assert; import junit.framework.Assert;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.CommandLineExecutable;
import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.utils.GenomeLocParserTestUtils; import org.broadinstitute.sting.utils.GenomeLocParserTestUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.junit.Test; import org.junit.Test;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
@ -139,10 +143,25 @@ public class WalkerTest extends BaseTest {
} }
} }
/**
 * Validates the on-the-fly index written alongside a test result file, if one exists.
 *
 * The index location is obtained from {@code Tribble.indexFile(resultFile)}. When no
 * file exists at that location nothing is checked. When an index exists, the on-disk
 * index is asserted to be equal to one newly created from the result file using a
 * {@code VCFCodec}.
 *
 * @param name       name of the test; used only in the progress message
 * @param resultFile output file produced by the test
 * @throws StingException if an index exists for a result file whose path does not
 *                        contain ".vcf", since only VCF files can be validated here
 */
public void maybeValidateSupplementaryFile(final String name, final File resultFile) {
    final File indexFile = Tribble.indexFile(resultFile);

    // No index was written for this result, so there is nothing to validate.
    if ( ! indexFile.exists() )
        return;

    // The check was previously inverted: it threw for VCF files and let everything else
    // fall through to VCF validation. Only non-VCF files should be rejected.
    if ( ! resultFile.getAbsolutePath().contains(".vcf") ) {
        // todo -- currently we only understand VCF files! Blow up since we can't test them
        throw new StingException("Found an index created for file " + resultFile + " but we can only validate VCF files. Extend this code!");
    }

    System.out.println("Verifying on-the-fly index " + indexFile + " for test " + name + " using file " + resultFile);
    Assert.assertTrue(IndexFactory.onDiskIndexEqualToNewlyCreatedIndex(resultFile, indexFile, new VCFCodec()));
}
public List<String> assertMatchingMD5s(final String name, List<File> resultFiles, List<String> expectedMD5s) { public List<String> assertMatchingMD5s(final String name, List<File> resultFiles, List<String> expectedMD5s) {
List<String> md5s = new ArrayList<String>(); List<String> md5s = new ArrayList<String>();
for (int i = 0; i < resultFiles.size(); i++) { for (int i = 0; i < resultFiles.size(); i++) {
String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i));
maybeValidateSupplementaryFile(name, resultFiles.get(i));
md5s.add(i, md5); md5s.add(i, md5);
} }

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.refdata.tracks.builders;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.Tribble;
import org.broad.tribble.index.Index; import org.broad.tribble.index.Index;
import org.broad.tribble.vcf.VCFCodec; import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
@ -77,7 +78,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
} }
// make sure we didn't write the file (check that it's timestamp is within bounds) // make sure we didn't write the file (check that it's timestamp is within bounds)
//System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()); //System.err.println(new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified());
Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + RMDTrackBuilder.indexExtension).lastModified()) < 100); Assert.assertTrue(Math.abs(1279591752000l - Tribble.indexFile(vcfFile).lastModified()) < 100);
} }
@ -86,7 +87,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test @Test
public void testDirIsLockedIndexFromDisk() { public void testDirIsLockedIndexFromDisk() {
File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf"); File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf");
File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/read_only/good_index.vcf.idx"); File vcfFileIndex = Tribble.indexFile(vcfFile);
Index ind = null; Index ind = null;
try { try {
ind = builder.attemptIndexFromDisk(vcfFile,new VCFCodec(),vcfFileIndex,new FSLockWithShared(vcfFile)); ind = builder.attemptIndexFromDisk(vcfFile,new VCFCodec(),vcfFileIndex,new FSLockWithShared(vcfFile));
@ -102,7 +103,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test @Test
public void testBuilderIndexDirectoryUnwritable() { public void testBuilderIndexDirectoryUnwritable() {
File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf"); File vcfFile = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf");
File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/read_only/no_index.vcf.idx"); File vcfFileIndex = Tribble.indexFile(vcfFile);
Index ind = null; Index ind = null;
try { try {
@ -121,7 +122,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test @Test
public void testGenerateIndexForUnindexedFile() { public void testGenerateIndexForUnindexedFile() {
File vcfFile = new File(validationDataLocation + "/ROD_validation/always_reindex.vcf"); File vcfFile = new File(validationDataLocation + "/ROD_validation/always_reindex.vcf");
File vcfFileIndex = new File(validationDataLocation + "/ROD_validation/always_reindex.vcf.idx"); File vcfFileIndex = Tribble.indexFile(vcfFile);
// if we can't write to the directory, don't fault the tester, just pass // if we can't write to the directory, don't fault the tester, just pass
if (!vcfFileIndex.getParentFile().canWrite()) { if (!vcfFileIndex.getParentFile().canWrite()) {
@ -147,7 +148,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
@Test @Test
public void testBuilderIndexSequenceDictionary() { public void testBuilderIndexSequenceDictionary() {
File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf")); File vcfFile = createCorrectDateIndexFile(new File(validationDataLocation + "/ROD_validation/newerTribbleTrack.vcf"));
Long indexTimeStamp = new File(vcfFile.getAbsolutePath() + ".idx").lastModified(); Long indexTimeStamp = Tribble.indexFile(vcfFile).lastModified();
try { try {
Index idx = builder.loadIndex(vcfFile, new VCFCodec()); Index idx = builder.loadIndex(vcfFile, new VCFCodec());
RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false); RMDTrackBuilder.setIndexSequenceDictionary(idx,seq.getSequenceDictionary(),vcfFile,false);
@ -157,11 +158,9 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
e.printStackTrace(); e.printStackTrace();
Assert.fail("IO exception unexpected" + e.getMessage()); Assert.fail("IO exception unexpected" + e.getMessage());
} }
//System.err.println("index : " + new File(vcfFile + ".idx").lastModified());
//System.err.println("old : " + indexTimeStamp);
// make sure that we removed and updated the index // make sure that we removed and updated the index
Assert.assertTrue("Fail: index file was modified", new File(vcfFile + ".idx").lastModified() == indexTimeStamp); Assert.assertTrue("Fail: index file was modified", Tribble.indexFile(vcfFile).lastModified() == indexTimeStamp);
} }
/** /**
@ -184,11 +183,11 @@ public class RMDTrackBuilderUnitTest extends BaseTest {
Thread.sleep(2000); Thread.sleep(2000);
// create a fake index, before we copy so it's out of date // create a fake index, before we copy so it's out of date
File tmpIndex = new File(tmpFile.getAbsolutePath() + ".idx"); File tmpIndex = Tribble.indexFile(tmpFile);
tmpIndex.deleteOnExit(); tmpIndex.deleteOnExit();
// copy the vcf (tribble) file to the tmp file location // copy the vcf (tribble) file to the tmp file location
copyFile(new File(tribbleFile + ".idx"), tmpIndex); copyFile(Tribble.indexFile(tribbleFile), tmpIndex);
return tmpFile; return tmpFile;

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.utils.genotype.vcf; package org.broadinstitute.sting.utils.genotype.vcf;
import org.broad.tribble.Tribble;
import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.AsciiLineReader;
import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.Genotype;
@ -76,7 +77,7 @@ public class VCFWriterUnitTest extends BaseTest {
counter++; counter++;
} }
Assert.assertEquals(2,counter); Assert.assertEquals(2,counter);
new File(fakeVCFFile + RMDTrackBuilder.indexExtension).delete(); Tribble.indexFile(fakeVCFFile).delete();
fakeVCFFile.delete(); fakeVCFFile.delete();
} }
catch (IOException e ) { catch (IOException e ) {