Added the ability in Tribble to create indexes from a stream of features, so that we can create multiple indexes in a single pass over the file. In the GATK we now create multiple indexes and choose the most appropriate one based on the feature density and the longest feature in the file.  Also:

- Converted Tribble to TestNG; it has better features and is about 6x faster.
- As much code clean-up as I could get done.  More to do, especially in the example code.
- Replaced asserts in the code with thrown exceptions.
- Added getBinSize to the index interface; both indexes already implemented this.
- Removed the abstract parts of the IndexCreator interface; it is now simpler.
- Added an IndexType enumeration; it might be overkill, but it at least provides a single point of entry for index information.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4082 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2010-08-23 06:54:59 +00:00
parent 295472bf69
commit 2d3b6d89dc
2 changed files with 28 additions and 39 deletions

View File

@ -74,13 +74,11 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
} }
/** @return a list of all available track types we currently have access to create */ /** @return a list of all available track types we currently have access to create */
@Override
public Map<String, Class> getAvailableTrackNamesAndTypes() { public Map<String, Class> getAvailableTrackNamesAndTypes() {
return new HashMap<String, Class>(this.pluginsByName); return new HashMap<String, Class>(this.pluginsByName);
} }
/** @return a list of all available track record types we currently have access to create */ /** @return a list of all available track record types we currently have access to create */
@Override
public Map<String, Class> getAvailableTrackNamesAndRecordTypes() { public Map<String, Class> getAvailableTrackNamesAndRecordTypes() {
Map<String, Class> classes = new HashMap<String, Class>(); Map<String, Class> classes = new HashMap<String, Class>();
for (String name: this.pluginsByName.keySet()) { for (String name: this.pluginsByName.keySet()) {
@ -101,7 +99,6 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
* @throws RMDTrackCreationException * @throws RMDTrackCreationException
* if we don't know of the target class or we couldn't create it * if we don't know of the target class or we couldn't create it
*/ */
@Override
public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException { public RMDTrack createInstanceOfTrack(Class targetClass, String name, File inputFile) throws RMDTrackCreationException {
// return a feature reader track // return a feature reader track
Pair<BasicFeatureSource, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile); Pair<BasicFeatureSource, SAMSequenceDictionary> pair = createFeatureReader(targetClass, name, inputFile);
@ -190,9 +187,7 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException { public synchronized static Index loadIndex(File inputFile, FeatureCodec codec, boolean onDisk) throws IOException {
// create the index file name, locking on the index file name // create the index file name, locking on the index file name
File indexFile = null; File indexFile = new File(inputFile.getAbsoluteFile() + indexExtension);
indexFile = new File(inputFile.getAbsoluteFile() + indexExtension);
FSLockWithShared lock = new FSLockWithShared(indexFile); FSLockWithShared lock = new FSLockWithShared(indexFile);
// acquire a lock on the file // acquire a lock on the file
@ -307,9 +302,7 @@ public class TribbleRMDTrackBuilder extends PluginManager<FeatureCodec> implemen
private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException { private static Index createIndexInMemory(File inputFile, FeatureCodec codec) throws IOException {
// this can take a while, let them know what we're doing // this can take a while, let them know what we're doing
logger.info("Creating Tribble index in memory for file " + inputFile); logger.info("Creating Tribble index in memory for file " + inputFile);
IndexCreator creator; return IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
creator = new LinearIndexCreator(inputFile, codec, null);
return creator.createIndex();
} }
/** /**

View File

@ -5,7 +5,6 @@ import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Level; import org.apache.log4j.Level;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.Feature; import org.broad.tribble.Feature;
import org.broad.tribble.bed.BEDCodec;
import org.broad.tribble.index.Index; import org.broad.tribble.index.Index;
import org.broad.tribble.index.linear.LinearIndex; import org.broad.tribble.index.linear.LinearIndex;
import org.broad.tribble.iterators.CloseableTribbleIterator; import org.broad.tribble.iterators.CloseableTribbleIterator;
@ -45,7 +44,7 @@ public class IndexPerformanceTests extends BaseTest {
String fileLocation = validationDataLocation + "Index_Performance_Data/"; String fileLocation = validationDataLocation + "Index_Performance_Data/";
// bin sizes to try // bin sizes to try
int[] binSizes = {100, 1000, 5000, 16000}; int[] binSizes = {10, 100, 1000, 5000, 10000, 50000};
PrintWriter writer; PrintWriter writer;
PrintWriter writer2; PrintWriter writer2;
@ -57,20 +56,19 @@ public class IndexPerformanceTests extends BaseTest {
IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference)); IndexedFastaSequenceFile seq = new IndexedFastaSequenceFile(new File(hg18Reference));
GenomeLocParser.setupRefContigOrdering(seq); GenomeLocParser.setupRefContigOrdering(seq);
int recordCount[] = {10,100,1000,10000,100000,500000,1000000};
int longestFeature[] = {1,50,100,1000,100000};
// the input files // the input files
for (int rCount : recordCount){ /*inputFiles.put("\"10\"",new File(fileLocation + "tip10.vcf"));
for (int longest : longestFeature) { inputFiles.put("\"100\"",new File(fileLocation + "tip100.vcf"));
inputFiles.put("./BED/" + "bed_density_" + rCount + "_fLengthMax_" + longest + ".BED",new File("./BED/" + "bed_density_" + rCount + "_fLengthMax_" + longest + ".BED")); inputFiles.put("\"1,000\"",new File(fileLocation + "tip1000.vcf"));
} inputFiles.put("\"10,000\"",new File(fileLocation + "tip10000.vcf"));
} inputFiles.put("\"100,000\"",new File(fileLocation + "tip100000.vcf"));
inputFiles.put("\"1,000,000\"",new File(fileLocation + "tip1000000.vcf"));*/
for (String name : inputFiles.keySet()) { for (String name : inputFiles.keySet()) {
inputTypes.put(name, BEDCodec.class); inputTypes.put(name,VCFCodec.class);
} }
inputFiles.put("Big Table",new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt"));
inputTypes.put("Big Table", AnnotatorInputTableCodec.class);
} }
@Test @Test
@ -117,10 +115,7 @@ public class IndexPerformanceTests extends BaseTest {
* every other 1000 bases of chr1 (of the first 100M), the count of records seen in the last operation, and the index size * every other 1000 bases of chr1 (of the first 100M), the count of records seen in the last operation, and the index size
*/ */
public List<Long> performIndexTest(String name, boolean useLinear, int size) { public List<Long> performIndexTest(String name, boolean useLinear, int size) {
//TribbleRMDTrackBuilder.useLinearIndex = useLinear; deleteIndex(inputFiles.get(name));
//TribbleRMDTrackBuilder.binSize = size;
deleteIndex(new File(inputFiles.get(name) + ((useLinear) ? ".idx" : ".tdx")));
// time creating the index // time creating the index
long createTime = System.currentTimeMillis(); long createTime = System.currentTimeMillis();
Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get(name),inputFiles.get(name)); Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get(name),inputFiles.get(name));
@ -137,7 +132,7 @@ public class IndexPerformanceTests extends BaseTest {
for (int x = 1; x < 1000000; x = x + 1000) { for (int x = 1; x < 1000000; x = x + 1000) {
//CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x+(int)Math.floor(Math.random()*1000), x+1000); // query //CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x+(int)Math.floor(Math.random()*1000), x+1000); // query
CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x+1000); // query CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x+1000); // query
while (iter.hasNext() && iter.next().getStart() < x) { for (Feature feat : iter) {
count++; count++;
} }
} }
@ -185,11 +180,11 @@ public class IndexPerformanceTests extends BaseTest {
} }
else if (features2.get(entry.getKey()) != entry.getValue()) { else if (features2.get(entry.getKey()) != entry.getValue()) {
/*System.err.println("counts are not equal at " + System.err.println("counts are not equal at " +
entry.getKey() + entry.getKey() +
" features2.get(entry.getKey()) = " + " features2.get(entry.getKey()) = " +
features2.get(entry.getKey()) + features2.get(entry.getKey()) +
" feature1 = " + entry.getValue());*/ " feature1 = " + entry.getValue());
} }
if (features2.containsKey(entry.getKey())) features2.remove(entry.getKey()); if (features2.containsKey(entry.getKey())) features2.remove(entry.getKey());
} }
@ -207,17 +202,16 @@ public class IndexPerformanceTests extends BaseTest {
private Map<Integer,Integer> getMapOfFeatures(Map<GenomeLoc, Integer> features, boolean useLinear) { private Map<Integer,Integer> getMapOfFeatures(Map<GenomeLoc, Integer> features, boolean useLinear) {
File bigTable = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt"); File bigTable = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt");
//TribbleRMDTrackBuilder.useLinearIndex = useLinear;
//TribbleRMDTrackBuilder.binSize = 1000;
deleteIndex(inputFiles.get("Big Table")); deleteIndex(inputFiles.get("Big Table"));
// time creating the index // time creating the index
logger.warn("creating index"); logger.warn("creating index, linear = " + useLinear);
Map<Integer,Integer> bucketToCount = new TreeMap<Integer,Integer>(); Map<Integer,Integer> bucketToCount = new TreeMap<Integer,Integer>();
Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"),inputFiles.get("Big Table")); Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"),inputFiles.get("Big Table"));
logger.warn("created index, traversing");
try { try {
for (int x = 5000; x < 6000; x = x + 1000) { for (Integer x = 5000; x < 6000; x = x + 1000) {
int bucketCount = 0; int bucketCount = 0;
CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x+1000); // query CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x+1000); // query
for (Feature feat : iter) { for (Feature feat : iter) {
@ -227,10 +221,12 @@ public class IndexPerformanceTests extends BaseTest {
if (features.containsKey(loc)) if (features.containsKey(loc))
count = features.get(loc)+1; count = features.get(loc)+1;
features.put(loc,count); features.put(loc,count);
bucketCount++; if (bucketToCount.containsKey(x)) bucketToCount.put(x,bucketToCount.get(x)+1);
else bucketToCount.put(x,1);
} }
bucketToCount.put(x,bucketCount); //bucketToCount.put(x,bucketCount);
} }
logger.warn("Done, returning");
} catch (IOException e) { } catch (IOException e) {
Assert.fail("Unable to load file for query!!"); Assert.fail("Unable to load file for query!!");
} }
@ -240,8 +236,6 @@ public class IndexPerformanceTests extends BaseTest {
//@Test //@Test
public void testGetTreeIndexLocation() { public void testGetTreeIndexLocation() {
File bigTable = new File("small.table.txt"); File bigTable = new File("small.table.txt");
//TribbleRMDTrackBuilder.useLinearIndex = false;
//TribbleRMDTrackBuilder.binSize = 1000;
deleteIndex(bigTable); deleteIndex(bigTable);
// time creating the index // time creating the index
@ -249,6 +243,7 @@ public class IndexPerformanceTests extends BaseTest {
Map<Integer,Integer> bucketToCount = new TreeMap<Integer,Integer>(); Map<Integer,Integer> bucketToCount = new TreeMap<Integer,Integer>();
Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"),bigTable); Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"),bigTable);
logger.warn("created index, traversing");
try { try {
int count= 0; int count= 0;
CloseableTribbleIterator<Feature> iter = null; CloseableTribbleIterator<Feature> iter = null;
@ -258,6 +253,7 @@ public class IndexPerformanceTests extends BaseTest {
GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd()); GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd());
if (loc.getStop() < 5000 || loc.getStart() > 6000) continue; if (loc.getStop() < 5000 || loc.getStart() > 6000) continue;
count++; count++;
System.err.println(feat.toString());
} }
System.err.println(count); System.err.println(count);
} catch (IOException e) { } catch (IOException e) {
@ -267,10 +263,10 @@ public class IndexPerformanceTests extends BaseTest {
private void deleteIndex(File fl) { private void deleteIndex(File fl) {
System.err.println("Trying to delete index " + fl); File indexFile = new File(fl + TribbleRMDTrackBuilder.indexExtension);
boolean deleted = true; boolean deleted = true;
if (fl.exists()) if (indexFile.exists())
deleted = fl.delete(); deleted = indexFile.delete();
if (!deleted) if (!deleted)
Assert.fail("Unable to delete index file"); Assert.fail("Unable to delete index file");
} }