diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 4af224752..6d5ee4d7a 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -169,11 +169,10 @@ public class GenomeAnalysisEngine { ShardStrategy shardStrategy = getShardStrategy(my_walker, microScheduler.getReference(), intervals, - argCollection.maximumEngineIterations, readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); // execute the microscheduler, storing the results - Object result = microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations); + Object result = microScheduler.execute(my_walker, shardStrategy); //monitor.stop(); //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); @@ -719,13 +718,11 @@ public class GenomeAnalysisEngine { * @param walker Walker for which to infer sharding strategy. * @param drivingDataSource Data on which to shard. * @param intervals Intervals to use when limiting sharding. - * @param maxIterations the maximum number of iterations to run through * @return Sharding strategy for this driving data source. */ protected ShardStrategy getShardStrategy(Walker walker, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals, - Integer maxIterations, ValidationExclusion exclusions) { // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original // sharding system; it's required with the new sharding system only for locus walkers. @@ -775,13 +772,13 @@ public class GenomeAnalysisEngine { ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, - intervals, maxIterations); + intervals); } else shardStrategy = ShardStrategyFactory.shatter(readsDataSource, referenceDataSource.getReference(), ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL, drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, maxIterations); + SHARD_SIZE); } else if (walker instanceof ReadWalker || walker instanceof DuplicateWalker) { shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL; @@ -792,13 +789,13 @@ public class GenomeAnalysisEngine { shardType, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, - intervals, maxIterations); + intervals); } else { shardStrategy = ShardStrategyFactory.shatter(readsDataSource, referenceDataSource.getReference(), shardType, drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, maxIterations); + SHARD_SIZE); } } else if (walker instanceof ReadPairWalker) { if(readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname) @@ -810,7 +807,7 @@ public class GenomeAnalysisEngine { referenceDataSource.getReference(), ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, drivingDataSource.getSequenceDictionary(), - SHARD_SIZE, maxIterations); + SHARD_SIZE); } else throw new StingException("Unable to support walker of type" + walker.getClass().getName()); diff --git a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 3be32ec49..f6143c5ac 100755 --- a/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -128,10 +128,6 @@ public class GATKArgumentCollection { @Output(fullName = "outerr", shortName = "oe", doc = "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", required = false) public String outErrFileName = null; - @Element(required = false) - @Argument(fullName = "maximum_iterations", shortName = "M", doc = "Maximum number of iterations to process before exiting, the lower bound is zero. Intended only for testing", required = false) - public Integer maximumEngineIterations = -1; - @Element(required = false) @Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level. Vastly improves performance at areas with abnormal depth due to mapping Q0 reads", required = false) public Boolean filterZeroMappingQualityReads = false; @@ -294,9 +290,6 @@ public class GATKArgumentCollection { if (!(other.readBufferSize == null && this.readBufferSize == null) && (other.readBufferSize == null || this.readBufferSize == null)) { return false; } - if (!other.maximumEngineIterations.equals(this.maximumEngineIterations)) { - return false; - } if (!other.strictnessLevel.equals(this.strictnessLevel)) { return false; } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index 543d2f6d9..f3faf9a52 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -113,15 +113,11 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar } } - public Object execute( Walker walker, ShardStrategy shardStrategy, int maxIterations ) { + public Object execute( Walker walker, ShardStrategy shardStrategy ) { // Fast fail for walkers not supporting TreeReducible interface. if (!( walker instanceof TreeReducible )) throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers"); - // Having maxiterations in the execute method is a holdover from the old TraversalEngine days. - // Lets do something else with this. - traversalEngine.setMaximumIterations(maxIterations); - ReduceTree reduceTree = new ReduceTree(this); initializeWalker(walker); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 7da467d16..9acebe6a4 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -47,11 +47,7 @@ public class LinearMicroScheduler extends MicroScheduler { * @param walker Computation to perform over dataset. * @param shardStrategy A strategy for sharding the data. */ - public Object execute(Walker walker, ShardStrategy shardStrategy, int maxIterations) { - // Having maxiterations in the execute method is a holdover from the old TraversalEngine days. - // Lets do something else with this. - traversalEngine.setMaximumIterations(maxIterations); - + public Object execute(Walker walker, ShardStrategy shardStrategy) { walker.initialize(); Accumulator accumulator = Accumulator.create(engine,walker); diff --git a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 603696616..ff6712481 100755 --- a/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -133,7 +133,7 @@ public abstract class MicroScheduler { * * @return the return type of the walker */ - public abstract Object execute(Walker walker, ShardStrategy shardStrategy, int iterations ); + public abstract Object execute(Walker walker, ShardStrategy shardStrategy); /** * Retrieves the object responsible for tracking and managing output. diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index b84f94d5b..49c7b883d 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -24,20 +24,9 @@ public abstract class TraversalEngine,Provide private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000; // 10 seconds in millisecs private final long N_RECORDS_TO_PRINT = 1000000; - // Maximum number of reads to process before finishing - protected long maximumIterations = -1; - /** our log, which we want to capture anything from this class */ protected static Logger logger = Logger.getLogger(TraversalEngine.class); - /** - * set the max number of iterations - * @param maximumIterations the number of iterations - */ - public void setMaximumIterations(final int maximumIterations) { - this.maximumIterations = maximumIterations; - } - /** * @param curTime (current runtime, in millisecs) * diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 199daf911..e535e1707 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -196,11 +196,6 @@ public class TraverseDuplicates extends TraversalEngine 0 && TraversalStatistics.nRecords > this.maximumIterations) { - logger.warn(String.format(("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords))); - break; - } } return sum; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java index e6e0d1627..e58ae13d3 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLoci.java @@ -76,11 +76,6 @@ public class TraverseLoci extends TraversalEngine,Locu sum = walker.reduce(x, sum); } - if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) { - logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords)); - break; - } - printProgress(LOCI_STRING, locus.getLocation()); } } diff --git a/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java index f98175004..a9134876b 100755 --- a/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java @@ -80,7 +80,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest { List input = new ArrayList(); input.add(new File("test.file")); collect.samFiles = input; - collect.maximumEngineIterations = -1; collect.strictnessLevel = SAMFileReader.ValidationStringency.STRICT; collect.referenceFile = new File("referenceFile".toLowerCase()); collect.DBSNPFile = "DBSNPFile".toLowerCase(); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java index 05e05ec0a..0f1e1f70c 100644 --- a/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/tracks/builders/IndexPerformanceTests.java @@ -14,6 +14,7 @@ import org.broad.tribble.vcf.VCFCodec; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec; import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.collections.Pair; import org.junit.Assert; @@ -162,35 +163,104 @@ public class IndexPerformanceTests extends BaseTest { //@Test public void testBigTable() { + // store the mapping of location -> variant; this will get big + Map features = new TreeMap(); + + Map bucketToCount = getMapOfFeatures(features,false); + Pair pairing; + + Map features2 = new TreeMap(); + Map bucketToCount2 = getMapOfFeatures(features2,true); + + System.err.println("Summary: "); + System.err.println("Summary: tree " + features.size()); + System.err.println("Summary: linear " + features2.size()); + + // compare the two + for (Map.Entry entry: features.entrySet()) { + if (!features2.containsKey(entry.getKey())) { + System.err.println("key " + entry + " missing from linear, count " + entry.getValue()); + + } + else if (features2.get(entry.getKey()) != entry.getValue()) { + /*System.err.println("counts are not equal at " + + entry.getKey() + + " features2.get(entry.getKey()) = " + + features2.get(entry.getKey()) + + " feature1 = " + entry.getValue());*/ + } + if (features2.containsKey(entry.getKey())) features2.remove(entry.getKey()); + } + System.err.println("Missing from the tree :"); + for (Map.Entry entry2: features2.entrySet()) { + System.err.println("Position " + entry2.getKey() + " count = " + entry2.getValue()); + } + + for (Integer bucket : bucketToCount.keySet()) { + if (!bucketToCount2.get(bucket).equals(bucketToCount.get(bucket))) { + System.err.println("Bucket " + bucket + " tree != linear, " + bucketToCount2.get(bucket) + " " + bucketToCount.get(bucket)); + } + } + } + + private Map getMapOfFeatures(Map features, boolean useLinear) { File bigTable = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt"); - TribbleRMDTrackBuilder.useLinearIndex = false; + TribbleRMDTrackBuilder.useLinearIndex = useLinear; TribbleRMDTrackBuilder.binSize = 1000; deleteIndex(inputFiles.get("Big Table")); // time creating the index logger.warn("creating index"); - long createTime = System.currentTimeMillis(); + + Map bucketToCount = new TreeMap(); Pair pairing = builder.createFeatureReader(inputTypes.get("Big Table"),inputFiles.get("Big Table")); - createTime = System.currentTimeMillis() - createTime; - //System.err.println("index creation took " + createTime); - PrintWriter stream = null; - logger.warn("reading and writing"); try { - stream = new PrintWriter(new File("bigTable.out.tree")); - } catch (FileNotFoundException e) { - Assert.fail("Fail!!!"); - } - try { - for (int x = 1; x < 200000; x = x + 1000) { + for (int x = 5000; x < 6000; x = x + 1000) { + int bucketCount = 0; CloseableTribbleIterator iter = pairing.first.query("chr1", x, x+1000); // query for (Feature feat : iter) { - stream.println(((AnnotatorInputTableFeature)feat).toString()); + GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd()); + if (loc.getStop() < 5000 || loc.getStart() > 6000) continue; + int count = 0; + if (features.containsKey(loc)) + count = features.get(loc)+1; + features.put(loc,count); + bucketCount++; } + bucketToCount.put(x,bucketCount); } } catch (IOException e) { Assert.fail("Unable to load file for query!!"); } - stream.close(); + return bucketToCount; + } + + //@Test + public void testGetTreeIndexLocation() { + File bigTable = new File("small.table.txt"); + TribbleRMDTrackBuilder.useLinearIndex = false; + TribbleRMDTrackBuilder.binSize = 1000; + + deleteIndex(bigTable); + // time creating the index + logger.warn("creating index"); + + Map bucketToCount = new TreeMap(); + Pair pairing = builder.createFeatureReader(inputTypes.get("Big Table"),bigTable); + try { + int count= 0; + CloseableTribbleIterator iter = null; + for (int x = 5000; x < 6000; x = x + 1000) + iter = pairing.first.query("chr1", x, x+1000); // query + for (Feature feat : iter) { + GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(),feat.getStart(),feat.getEnd()); + if (loc.getStop() < 5000 || loc.getStart() > 6000) continue; + count++; + } + System.err.println(count); + } catch (IOException e) { + Assert.fail("Unable to load file for query!!"); + } }