diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java index 60436fb15..254aea5a1 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilder.java @@ -26,13 +26,17 @@ package org.broadinstitute.sting.gatk.refdata.tracks.builders; import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.*; import org.broad.tribble.index.Index; +import org.broad.tribble.index.IndexCreator; import org.broad.tribble.index.IndexFactory; -import org.broad.tribble.index.linear.LinearIndex; +import org.broad.tribble.index.interval.IntervalIndexCreator; import org.broad.tribble.index.linear.LinearIndexCreator; import org.broad.tribble.source.BasicFeatureSource; +import org.broad.tribble.util.LEDataOutputStream; +import org.broad.tribble.util.LEDataStreamUtils; import org.broad.tribble.vcf.NameAwareCodec; import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; @@ -45,8 +49,7 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** @@ -65,9 +68,11 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen */ private static Logger logger = Logger.getLogger(TribbleRMDTrackBuilder.class); + // what index to use + static boolean useLinearIndex = true; // the linear index extension - public static final String linearIndexExtension = ".idx"; + public static final String indexExtension = ".idx"; /** Create a new plugin manager. 
*/ public TribbleRMDTrackBuilder() { @@ -157,7 +162,10 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen Pair reader; try { Index index = loadIndex(inputFile, createCodec(targetClass, name), true); - reader = new Pair(new BasicFeatureSource(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)),index.getSequenceDictionary()); + reader = new Pair(new BasicFeatureSource(inputFile.getAbsolutePath(), + index, + createCodec(targetClass, name)), + sequenceSetToDictionary(index.getSequenceNames())); } catch (FileNotFoundException e) { throw new StingException("Unable to create reader with file " + inputFile, e); } catch (IOException e) { @@ -177,7 +185,7 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException { // create the index file name, locking on the index file name - File indexFile = new File(inputFile.getAbsoluteFile() + linearIndexExtension); + File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension); FSLockWithShared lock = new FSLockWithShared(indexFile); // acquire a lock on the file @@ -259,7 +267,9 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen locked = lock.exclusiveLock(); if (locked) { logger.info("Writing Tribble index to disk for file " + inputFile); - index.write(indexFile); + LEDataOutputStream stream = LEDataStreamUtils.createOutputStream(indexFile); + index.write(stream); + stream.close(); } else // we can't write it to disk, just store it in memory, tell them this if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only"); @@ -280,7 +290,28 @@ public class TribbleRMDTrackBuilder extends PluginManager implemen private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in 
memory for file " + inputFile); - LinearIndexCreator creator = new LinearIndexCreator(inputFile,codec,null); + IndexCreator creator; + if (useLinearIndex) + creator = new LinearIndexCreator(inputFile,codec,null); + else + creator = new IntervalIndexCreator(inputFile, codec, null); return creator.createIndex(); } + + /** + * convert a list of Strings into a sequence dictionary + * @param contigList the contig list, in coordinate order, this is allowed to be null + * @return a SAMSequenceDictionary, WITHOUT contig sizes + */ + private static final SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet contigList) { + SAMSequenceDictionary dict = new SAMSequenceDictionary(); + if (contigList == null) return dict; + + for (String name : contigList) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + } diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java new file mode 100644 index 000000000..25fd04b60 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java @@ -0,0 +1,175 @@ +package org.broadinstitute.sting.gatk.refdata.tracks.builders; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.SAMSequenceDictionary; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.broad.tribble.Feature; +import org.broad.tribble.index.Index; +import org.broad.tribble.index.linear.LinearIndex; +import org.broad.tribble.iterators.CloseableTribbleIterator; +import org.broad.tribble.source.BasicFeatureSource; +import org.broad.tribble.vcf.VCF3Codec; +import org.broad.tribble.vcf.VCFCodec; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; +import org.broadinstitute.sting.utils.GenomeLocParser; 
+import org.broadinstitute.sting.utils.collections.Pair; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; + +/** + * performance tests for different index types + */ +public class IndexPerformanceTests extends BaseTest { + // the RMD track builder + private TribbleRMDTrackBuilder builder; + + // logger for this test class; its level is set in setupFilesAndIndexes() + Logger logger = Logger.getLogger(IndexPerformanceTests.class); + + // the input files to test, keyed by display name (LinkedHashMap, so iteration follows insertion order) + Map inputFiles = new LinkedHashMap(); + + // the codec class for each input, keyed by the same names as inputFiles + Map inputTypes = new HashMap(); + + PrintWriter writer; // receives the result rows for the linear index (see printTestLine) + PrintWriter writer2; // receives the result rows for the non-linear ("tree") index + /** runs before each test: creates the builder, sets up the reference contig ordering, and registers the input files and their codecs by name */ + @Before + public void setupFilesAndIndexes() { + logger.setLevel(Level.INFO); + builder = new TribbleRMDTrackBuilder(); + IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference)); + GenomeLocParser.setupRefContigOrdering(seq); + + // the input files; the key names contain literal double quotes (presumably so names with commas survive the comma-separated output - TODO confirm) + inputFiles.put("\"10\"",new File("tip10.vcf")); + inputFiles.put("\"100\"",new File("tip100.vcf")); + inputFiles.put("\"1,000\"",new File("tip1000.vcf")); + inputFiles.put("\"10,000\"",new File("tip10000.vcf")); + inputFiles.put("\"100,000\"",new File("tip100000.vcf")); + inputFiles.put("\"1,000,000\"",new File("tip1000000.vcf")); + + for (String name : inputFiles.keySet()) { + inputTypes.put(name,VCFCodec.class); + } + inputFiles.put("Big Table",new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt")); + inputTypes.put("Big Table", AnnotatorInputTableCodec.class); + /*inputFiles.put("100", new File("1000.vcf")); + inputFiles.put("Medium (100K) VCF",new File("100K.vcf")); + inputFiles.put("Big Table",new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt")); + inputFiles.put("Huge (1M) VCF",new File("1M.vcf")); + // the input types + 
inputTypes.put("Huge (1M) VCF", VCFCodec.class); + inputTypes.put("Medium (100K) VCF", VCFCodec.class); + inputTypes.put("1000 records VCF", VCFCodec.class); + inputTypes.put("Big Table", AnnotatorInputTableCodec.class);*/ + } + + @Test + public void emptyTest() { + // intentionally empty; presumably a placeholder while the @Test on performanceTest is commented out + } + + //@Test + public void performanceTest() { + try { + writer = new PrintWriter(new FileWriter("testOutput_linear.txt")); + writer2 = new PrintWriter(new FileWriter("testOutput_tree.txt")); + } catch (IOException e) { + Assert.fail("Unable to open file testOutput.txt"); + } + writer.println("name,index,createTime,seekTime,thousandPerThousand,record_count,index_size"); + writer2.println("name,index,createTime,seekTime,thousandPerThousand,record_count,index_size"); + for (String name : inputFiles.keySet()) { + System.err.println("running " + name + " with linear index"); + printTestLine(name,true); + System.err.println("running " + name + " with tree index"); + printTestLine(name,false); + } + writer.close(); + writer2.close(); + } + + private void printTestLine(String name, boolean useLinear) { + PrintWriter wr = (useLinear) ? writer : writer2; + List values = performIndexTest(name,useLinear); + wr.print(name + "," + ((useLinear) ? 
"linear" : "tree")); + for (Long l : values) { + wr.print(","); + wr.print(l); + } + wr.println(); + } + + /** + * time various tasks using the index type selected by useLinear (the flag is written to TribbleRMDTrackBuilder.useLinearIndex) + * @param name the key used to look up the file in inputFiles and the codec in inputTypes + * @return five values: the time to create the feature reader, the time for the initial chr1 query, the time to query each + * consecutive 1000-base window over the first million bases of chr1 (all times in milliseconds), the count of records seen in that last operation, and the length of the .idx file + */ + public List performIndexTest(String name, boolean useLinear) { + TribbleRMDTrackBuilder.useLinearIndex = useLinear; + deleteIndex(inputFiles.get(name)); + // time creating the feature reader (the index file was deleted just above) + long createTime = System.currentTimeMillis(); + Pair pairing = builder.createFeatureReader(inputTypes.get(name),inputFiles.get(name)); + createTime = System.currentTimeMillis() - createTime; + System.err.println("index creation took " + createTime); + + // time an initial query on chr1 + long seekTo1 = seekToChr1(pairing); + + // query each 1000-base window over the first million bases of chr1, counting the features returned + long count = 0; + long thousandEveryThousand = System.currentTimeMillis(); + try { + for (int x = 1; x < 1000000; x = x + 1000) { + //CloseableTribbleIterator iter = pairing.first.query("chr1", x+(int)Math.floor(Math.random()*1000), x+1000); // query + CloseableTribbleIterator iter = pairing.first.query("chr1", x, x+1000); // query + for (Feature feat : iter) { + count++; + } + } + + } catch (IOException e) { + Assert.fail("Unable to load file for query!!"); + } + thousandEveryThousand = System.currentTimeMillis() - thousandEveryThousand; + System.err.println("thousand every thousand (for first million) took " + thousandEveryThousand); + return Arrays.asList(createTime,seekTo1,thousandEveryThousand,count,new File(inputFiles.get(name) + ".idx").length()); + } + + private long seekToChr1(Pair pairing) { + // time a query over bases 1-10,000,000 of chr1; the returned iterator is neither read nor closed here + long seekTo1 = System.currentTimeMillis(); + try { + CloseableTribbleIterator iter = pairing.first.query("chr1",1,10000000); // query + } catch (IOException e) 
{ + Assert.fail("Unable to load file for query!!"); + } + seekTo1 = System.currentTimeMillis() - seekTo1; + System.err.println("seeking to chr1 took " + seekTo1); + return seekTo1; + } + + + private void deleteIndex(File fl) { + File indexFile = new File(fl + TribbleRMDTrackBuilder.indexExtension); + boolean deleted = true; + if (indexFile.exists()) + deleted = indexFile.delete(); + if (!deleted) + Assert.fail("Unable to delete index file"); + } + +} diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java index 262c32b87..c988dae57 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/TribbleRMDTrackBuilderUnitTest.java @@ -73,8 +73,8 @@ public class TribbleRMDTrackBuilderUnitTest extends BaseTest { Assert.fail("IO exception unexpected" + e.getMessage()); } // make sure we didn't write the file (check that it's timestamp is within bounds) - //System.err.println(new File(vcfFile + TribbleRMDTrackBuilder.linearIndexExtension).lastModified()); - Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + TribbleRMDTrackBuilder.linearIndexExtension).lastModified()) < 100); + //System.err.println(new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified()); + Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified()) < 100); } diff --git a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index 33f69adfb..1cbea1029 100644 --- a/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -76,7 +76,7 @@ public 
class VCFWriterUnitTest extends BaseTest { counter++; } Assert.assertEquals(2,counter); - new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete(); + new File(fakeVCFFile + TribbleRMDTrackBuilder.indexExtension).delete(); fakeVCFFile.delete(); } catch (IOException e ) { diff --git a/settings/repository/org.broad/tribble-122M.jar b/settings/repository/org.broad/tribble-124M.jar similarity index 78% rename from settings/repository/org.broad/tribble-122M.jar rename to settings/repository/org.broad/tribble-124M.jar index a39750e2f..51780b5cb 100644 Binary files a/settings/repository/org.broad/tribble-122M.jar and b/settings/repository/org.broad/tribble-124M.jar differ diff --git a/settings/repository/org.broad/tribble-122M.xml b/settings/repository/org.broad/tribble-124M.xml similarity index 64% rename from settings/repository/org.broad/tribble-122M.xml rename to settings/repository/org.broad/tribble-124M.xml index 3d74b4640..ffb0daf3d 100644 --- a/settings/repository/org.broad/tribble-122M.xml +++ b/settings/repository/org.broad/tribble-124M.xml @@ -1,3 +1,3 @@ - +