updates to code dependent on Tribble, as well as the following Tribble changes:

- makes writing to disk optional for indexes using the indexCreator classes (allow the user to specify the index file, if null don't write it)
- removed some system.out debugging code
- fixed version checking in interval tree 
- made indexes store and return a LinkedHashSet of sequence names (to preserve the order in which the sequences appear in the file)
- index creators now read the file before creating the index
- changed the Index.write() method to take a LEDataStream instead of a file
- removed the sequence dictionary code on the header
- added utils for getting LEDataStreams
- added a base Tribble exception




git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3857 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-07-23 01:56:10 +00:00
parent c5325b03be
commit 9579aace1f
6 changed files with 218 additions and 12 deletions

View File

@ -26,13 +26,17 @@
package org.broadinstitute.sting.gatk.refdata.tracks.builders;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.apache.log4j.Logger;
import org.broad.tribble.*;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexCreator;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.index.linear.LinearIndex;
import org.broad.tribble.index.interval.IntervalIndexCreator;
import org.broad.tribble.index.linear.LinearIndexCreator;
import org.broad.tribble.source.BasicFeatureSource;
import org.broad.tribble.util.LEDataOutputStream;
import org.broad.tribble.util.LEDataStreamUtils;
import org.broad.tribble.vcf.NameAwareCodec;
import org.broadinstitute.sting.gatk.refdata.tracks.TribbleTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
@ -45,8 +49,7 @@ import org.broadinstitute.sting.utils.file.FSLockWithShared;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.*;
/**
@ -65,9 +68,11 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
*/
private static Logger logger = Logger.getLogger(TribbleRMDTrackBuilder.class);
// what index to use
static boolean useLinearIndex = true;
// the linear index extension
public static final String linearIndexExtension = ".idx";
public static final String indexExtension = ".idx";
/** Create a new plugin manager. */
public TribbleRMDTrackBuilder() {
@ -157,7 +162,10 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
Pair<BasicFeatureSource, SAMSequenceDictionary> reader;
try {
Index index = loadIndex(inputFile, createCodec(targetClass, name), true);
reader = new Pair<BasicFeatureSource, SAMSequenceDictionary>(new BasicFeatureSource(inputFile.getAbsolutePath(), index, createCodec(targetClass, name)),index.getSequenceDictionary());
reader = new Pair<BasicFeatureSource, SAMSequenceDictionary>(new BasicFeatureSource(inputFile.getAbsolutePath(),
index,
createCodec(targetClass, name)),
sequenceSetToDictionary(index.getSequenceNames()));
} catch (FileNotFoundException e) {
throw new StingException("Unable to create reader with file " + inputFile, e);
} catch (IOException e) {
@ -177,7 +185,7 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException {
// create the index file name, locking on the index file name
File indexFile = new File(inputFile.getAbsoluteFile() + linearIndexExtension);
File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension);
FSLockWithShared lock = new FSLockWithShared(indexFile);
// acquire a lock on the file
@ -259,7 +267,9 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
locked = lock.exclusiveLock();
if (locked) {
logger.info("Writing Tribble index to disk for file " + inputFile);
index.write(indexFile);
LEDataOutputStream stream = LEDataStreamUtils.createOutputStream(indexFile);
index.write(stream);
stream.close();
}
else // we can't write it to disk, just store it in memory, tell them this
if (onDisk) logger.info("Unable to write to " + indexFile + " for the index file, creating index in memory only");
@ -280,7 +290,28 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
/**
 * Build a Tribble index for the given file entirely in memory (nothing is written to disk;
 * the null index-file argument tells the creators not to write).
 *
 * @param inputFile the feature file to index
 * @param codec     the codec used to decode features while building the index
 * @return the newly created Index (linear or interval-tree, per the useLinearIndex flag)
 * @throws IOException if the input file cannot be read while creating the index
 */
private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException {
    // this can take a while, let them know what we're doing
    logger.info("Creating Tribble index in memory for file " + inputFile);
    // choose the index implementation from the class-level switch; the stale
    // duplicate LinearIndexCreator declaration from the old revision is removed
    IndexCreator creator;
    if (useLinearIndex)
        creator = new LinearIndexCreator(inputFile, codec, null);
    else
        creator = new IntervalIndexCreator(inputFile, codec, null);
    return creator.createIndex();
}
/**
 * Convert an ordered set of contig names into a SAM sequence dictionary.
 *
 * @param contigList the contig names, in coordinate order; allowed to be null
 * @return a SAMSequenceDictionary WITHOUT contig sizes (every record gets length 0,
 *         since the index stores only names); empty when contigList is null
 */
private static SAMSequenceDictionary sequenceSetToDictionary(LinkedHashSet<String> contigList) {
    SAMSequenceDictionary dict = new SAMSequenceDictionary();
    if (contigList == null) return dict;
    for (String name : contigList) {
        // length 0: the on-disk index records sequence names but not their sizes
        dict.addSequence(new SAMSequenceRecord(name, 0));
    }
    return dict;
}
}

View File

@ -0,0 +1,175 @@
package org.broadinstitute.sting.gatk.refdata.tracks.builders;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.linear.LinearIndex;
import org.broad.tribble.iterators.CloseableTribbleIterator;
import org.broad.tribble.source.BasicFeatureSource;
import org.broad.tribble.vcf.VCF3Codec;
import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.collections.Pair;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.*;
/**
 * Performance tests comparing Tribble index types (linear index vs. interval tree)
 * over inputs of increasing size.  Each run times index creation, one large query,
 * and a sweep of small queries, writing one CSV line per input file per index type.
 * NOTE(review): the timing test's @Test annotation is commented out, so only
 * emptyTest runs under normal JUnit execution.
 */
public class IndexPerformanceTests extends BaseTest {
    // the RMD track builder used to create feature readers (and thus their indexes)
    private TribbleRMDTrackBuilder builder;
    // class logger; its level is forced to INFO in setup
    Logger logger = Logger.getLogger(IndexPerformanceTests.class);
    // the input files to test, keyed by display label; LinkedHashMap preserves insertion (run) order
    Map<String, File> inputFiles = new LinkedHashMap<String,File>();
    // the codec class used to decode each input, keyed by the same labels
    Map<String, Class> inputTypes = new HashMap<String,Class>();
    // CSV results for the linear-index runs
    PrintWriter writer;
    // CSV results for the interval-tree runs
    PrintWriter writer2;

    /** setup the files we're going to run with, including their names */
    @Before
    public void setupFilesAndIndexes() {
        logger.setLevel(Level.INFO);
        builder = new TribbleRMDTrackBuilder();
        // hg18Reference is inherited from BaseTest -- presumably the hg18 FASTA path; verify against BaseTest
        IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference));
        GenomeLocParser.setupRefContigOrdering(seq);
        // the input files: VCFs of increasing record count (labels double as CSV fields)
        inputFiles.put("\"10\"",new File("tip10.vcf"));
        inputFiles.put("\"100\"",new File("tip100.vcf"));
        inputFiles.put("\"1,000\"",new File("tip1000.vcf"));
        inputFiles.put("\"10,000\"",new File("tip10000.vcf"));
        inputFiles.put("\"100,000\"",new File("tip100000.vcf"));
        inputFiles.put("\"1,000,000\"",new File("tip1000000.vcf"));
        // every VCF above is decoded with the VCF codec
        for (String name : inputFiles.keySet()) {
            inputTypes.put(name,VCFCodec.class);
        }
        inputFiles.put("Big Table",new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt"));
        inputTypes.put("Big Table", AnnotatorInputTableCodec.class);
        /*inputFiles.put("100", new File("1000.vcf"));
        inputFiles.put("Medium (100K) VCF",new File("100K.vcf"));
        inputFiles.put("Big Table",new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt"));
        inputFiles.put("Huge (1M) VCF",new File("1M.vcf"));
        // the input types
        inputTypes.put("Huge (1M) VCF", VCFCodec.class);
        inputTypes.put("Medium (100K) VCF", VCFCodec.class);
        inputTypes.put("1000 records VCF", VCFCodec.class);
        inputTypes.put("Big Table", AnnotatorInputTableCodec.class);*/
    }

    /** placeholder so the class always has at least one runnable JUnit test */
    @Test
    public void emptyTest() {
        // do nothing
    }

    // long-running timing test, disabled by default; re-enable by restoring @Test
    //@Test
    public void performanceTest() {
        try {
            writer = new PrintWriter(new FileWriter("testOutput_linear.txt"));
            writer2 = new PrintWriter(new FileWriter("testOutput_tree.txt"));
        } catch (IOException e) {
            Assert.fail("Unable to open file testOutput.txt");
        }
        // CSV headers (one results file per index type)
        writer.println("name,index,createTime,seekTime,thousandPerThousand,record_count,index_size");
        writer2.println("name,index,createTime,seekTime,thousandPerThousand,record_count,index_size");
        // run every input with both index implementations
        for (String name : inputFiles.keySet()) {
            System.err.println("running " + name + " with linear index");
            printTestLine(name,true);
            System.err.println("running " + name + " with tree index");
            printTestLine(name,false);
        }
        writer.close();
        writer2.close();
    }

    /**
     * run one timing pass and append its results as a CSV line
     * @param name      the input-file label (key into inputFiles/inputTypes)
     * @param useLinear true for the linear index, false for the interval tree
     */
    private void printTestLine(String name, boolean useLinear) {
        // each index type gets its own output file
        PrintWriter wr = (useLinear) ? writer : writer2;
        List<Long> values = performIndexTest(name,useLinear);
        wr.print(name + "," + ((useLinear) ? "linear" : "tree"));
        for (Long l : values) {
            wr.print(",");
            wr.print(l);
        }
        wr.println();
    }

    /**
     * time various tasks using the specified index
     * @param name      the name to get
     * @param useLinear true to time the linear index, false the interval tree
     * @return a five-piece: the time to create the index, the time to seek to chromosome 1, and the time to process reading
     * every other 1000 bases of chr1 (of the first 1M), the count of records seen in the last operation, and the index size
     */
    public List<Long> performIndexTest(String name, boolean useLinear) {
        // flip the builder's static index-type switch, then force a fresh index build
        TribbleRMDTrackBuilder.useLinearIndex = useLinear;
        deleteIndex(inputFiles.get(name));
        // time creating the index
        long createTime = System.currentTimeMillis();
        Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get(name),inputFiles.get(name));
        createTime = System.currentTimeMillis() - createTime;
        System.err.println("index creation took " + createTime);
        // seek to chr1
        long seekTo1 = seekToChr1(pairing);
        // query every 1000 bases in the first 1M of chr1, counting records returned
        long count = 0;
        long thousandEveryThousand = System.currentTimeMillis();
        try {
            for (int x = 1; x < 1000000; x = x + 1000) {
                //CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x+(int)Math.floor(Math.random()*1000), x+1000); // query
                CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x+1000); // query
                for (Feature feat : iter) {
                    count++;
                }
            }
        } catch (IOException e) {
            Assert.fail("Unable to load file for query!!");
        }
        thousandEveryThousand = System.currentTimeMillis() - thousandEveryThousand;
        System.err.println("thousand every thousand (for first million) took " + thousandEveryThousand);
        // the index size is read straight off the on-disk index file
        return Arrays.asList(createTime,seekTo1,thousandEveryThousand,count,new File(inputFiles.get(name) + ".idx").length());
    }

    /**
     * time a single large query over chr1:1-10,000,000
     * @param pairing the feature source / sequence dictionary pair to query
     * @return elapsed wall-clock time in milliseconds
     */
    private long seekToChr1(Pair<BasicFeatureSource, SAMSequenceDictionary> pairing) {
        // the returned iterator is intentionally unused -- only the query cost is measured
        long seekTo1 = System.currentTimeMillis();
        try {
            CloseableTribbleIterator iter = pairing.first.query("chr1",1,10000000); // query
        } catch (IOException e) {
            Assert.fail("Unable to load file for query!!");
        }
        seekTo1 = System.currentTimeMillis() - seekTo1;
        System.err.println("seeking to chr1 took " + seekTo1);
        return seekTo1;
    }

    /**
     * remove any existing on-disk index so the next run rebuilds it from scratch
     * @param fl the data file whose companion index file should be deleted
     */
    private void deleteIndex(File fl) {
        File indexFile = new File(fl + TribbleRMDTrackBuilder.indexExtension);
        boolean deleted = true;
        if (indexFile.exists())
            deleted = indexFile.delete();
        if (!deleted)
            Assert.fail("Unable to delete index file");
    }
}

View File

@ -73,8 +73,8 @@ public class TribbleRMDTrackBuilderUnitTest extends BaseTest {
Assert.fail("IO exception unexpected" + e.getMessage());
}
// make sure we didn't write the file (check that it's timestamp is within bounds)
//System.err.println(new File(vcfFile + TribbleRMDTrackBuilder.linearIndexExtension).lastModified());
Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + TribbleRMDTrackBuilder.linearIndexExtension).lastModified()) < 100);
//System.err.println(new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified());
Assert.assertTrue(Math.abs(1279591752000l - new File(vcfFile + TribbleRMDTrackBuilder.indexExtension).lastModified()) < 100);
}

View File

@ -76,7 +76,7 @@ public class VCFWriterUnitTest extends BaseTest {
counter++;
}
Assert.assertEquals(2,counter);
new File(fakeVCFFile + TribbleRMDTrackBuilder.linearIndexExtension).delete();
new File(fakeVCFFile + TribbleRMDTrackBuilder.indexExtension).delete();
fakeVCFFile.delete();
}
catch (IOException e ) {

View File

@ -1,3 +1,3 @@
<ivy-module version="1.0">
<info organisation="org.broad" module="tribble" revision="122M" status="integration" publication="201007191854200" />
<info organisation="org.broad" module="tribble" revision="124M" status="integration" publication="201007221854200" />
</ivy-module>