Fixes for the Tree index, and some small clean-up in the GATK: the testing-only maximum-iterations option is removed from the argument collection, the micro-schedulers, and the traversal engines.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3991 348d0f76-0448-11de-a6fe-93d51630548a
aaron 2010-08-09 20:41:50 +00:00
parent 3eee3183fd
commit 0f29f2ae3f
10 changed files with 93 additions and 63 deletions
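For orientation before the per-file diffs: the heart of the clean-up is dropping the iteration cap from the scheduler contract. Below is a minimal, self-contained sketch of the shape of that change; the class and method names mirror the diff, while the empty marker interfaces are illustrative only.

// Sketch of the MicroScheduler contract before and after this commit;
// Walker and ShardStrategy stand in for the real GATK types.
interface Walker {}
interface ShardStrategy {}

abstract class MicroSchedulerBefore {
    // Old contract: every caller had to thread an iteration cap through,
    // even though it was a testing-only holdover from TraversalEngine.
    public abstract Object execute(Walker walker, ShardStrategy shardStrategy, int maxIterations);
}

abstract class MicroSchedulerAfter {
    // New contract: the cap is gone and traversals always run to completion.
    public abstract Object execute(Walker walker, ShardStrategy shardStrategy);
}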

GenomeAnalysisEngine.java

@@ -169,11 +169,10 @@ public class GenomeAnalysisEngine {
        ShardStrategy shardStrategy = getShardStrategy(my_walker,
                microScheduler.getReference(),
                intervals,
                argCollection.maximumEngineIterations,
                readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
        // execute the microscheduler, storing the results
        Object result = microScheduler.execute(my_walker, shardStrategy, argCollection.maximumEngineIterations);
        Object result = microScheduler.execute(my_walker, shardStrategy);
        //monitor.stop();
        //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed()));
@@ -719,13 +718,11 @@ public class GenomeAnalysisEngine {
     * @param walker            Walker for which to infer sharding strategy.
     * @param drivingDataSource Data on which to shard.
     * @param intervals         Intervals to use when limiting sharding.
     * @param maxIterations     the maximum number of iterations to run through
     * @return Sharding strategy for this driving data source.
     */
    protected ShardStrategy getShardStrategy(Walker walker,
                                             ReferenceSequenceFile drivingDataSource,
                                             GenomeLocSortedSet intervals,
                                             Integer maxIterations,
                                             ValidationExclusion exclusions) {
        // Use monolithic sharding if no index is present. Monolithic sharding is always required for the original
        // sharding system; it's required with the new sharding system only for locus walkers.
@@ -775,13 +772,13 @@ public class GenomeAnalysisEngine {
                        ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE,
                        intervals, maxIterations);
                        intervals);
            } else
                shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                        referenceDataSource.getReference(),
                        ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE, maxIterations);
                        SHARD_SIZE);
        } else if (walker instanceof ReadWalker ||
                walker instanceof DuplicateWalker) {
            shardType = ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL;
@@ -792,13 +789,13 @@ public class GenomeAnalysisEngine {
                        shardType,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE,
                        intervals, maxIterations);
                        intervals);
            } else {
                shardStrategy = ShardStrategyFactory.shatter(readsDataSource,
                        referenceDataSource.getReference(),
                        shardType,
                        drivingDataSource.getSequenceDictionary(),
                        SHARD_SIZE, maxIterations);
                        SHARD_SIZE);
            }
        } else if (walker instanceof ReadPairWalker) {
            if (readsDataSource != null && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.queryname)
@@ -810,7 +807,7 @@ public class GenomeAnalysisEngine {
                    referenceDataSource.getReference(),
                    ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL,
                    drivingDataSource.getSequenceDictionary(),
                    SHARD_SIZE, maxIterations);
                    SHARD_SIZE);
        } else
            throw new StingException("Unable to support walker of type " + walker.getClass().getName());
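A note on the branching above: getShardStrategy picks a shatter strategy from the walker's traversal type, as the comment about locus walkers explains. The following is a condensed, hypothetical sketch of that dispatch; the walker names come from the diff, but the real method also wires in data sources, intervals, and shard sizes.

// Hypothetical condensation of the walker-type dispatch in getShardStrategy.
final class ShardDispatchSketch {
    interface LocusWalker {}
    interface ReadWalker {}
    interface DuplicateWalker {}
    interface ReadPairWalker {}

    enum ShatterStrategy { LOCUS_EXPERIMENTAL, READS_EXPERIMENTAL }

    static ShatterStrategy strategyFor(Object walker) {
        if (walker instanceof LocusWalker)
            return ShatterStrategy.LOCUS_EXPERIMENTAL;    // locus traversals shard by genomic position
        if (walker instanceof ReadWalker || walker instanceof DuplicateWalker || walker instanceof ReadPairWalker)
            return ShatterStrategy.READS_EXPERIMENTAL;    // read-based traversals shard by blocks of reads
        throw new IllegalArgumentException("Unable to support walker of type " + walker.getClass().getName());
    }
}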

GATKArgumentCollection.java

@@ -128,10 +128,6 @@ public class GATKArgumentCollection {
    @Output(fullName = "outerr", shortName = "oe", doc = "A joint file for 'normal' and error output presented to the walker. Will overwrite contents if file exists.", required = false)
    public String outErrFileName = null;
    @Element(required = false)
    @Argument(fullName = "maximum_iterations", shortName = "M", doc = "Maximum number of iterations to process before exiting; the lower bound is zero. Intended only for testing", required = false)
    public Integer maximumEngineIterations = -1;
    @Element(required = false)
    @Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level. Vastly improves performance in areas with abnormal depth due to mapping Q0 reads", required = false)
    public Boolean filterZeroMappingQualityReads = false;
@@ -294,9 +290,6 @@ public class GATKArgumentCollection {
        if (!(other.readBufferSize == null && this.readBufferSize == null) && (other.readBufferSize == null || this.readBufferSize == null)) {
            return false;
        }
        if (!other.maximumEngineIterations.equals(this.maximumEngineIterations)) {
            return false;
        }
        if (!other.strictnessLevel.equals(this.strictnessLevel)) {
            return false;
        }
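The readBufferSize test above packs the null handling into one condition: return false when exactly one of the two values is null. A hypothetical helper, not part of this commit, makes the intent easier to see.

// Hypothetical helper equivalent to the readBufferSize null check above:
// equality can only fail on nullness when exactly one side is null.
final class NullChecks {
    static boolean exactlyOneNull(Object a, Object b) {
        return (a == null) != (b == null);
    }
}
// Usage sketch inside equals():
//     if (NullChecks.exactlyOneNull(other.readBufferSize, this.readBufferSize)) return false;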

HierarchicalMicroScheduler.java

@@ -113,15 +113,11 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
        }
    }
    public Object execute( Walker walker, ShardStrategy shardStrategy, int maxIterations ) {
    public Object execute( Walker walker, ShardStrategy shardStrategy ) {
        // Fast fail for walkers not supporting TreeReducible interface.
        if (!( walker instanceof TreeReducible ))
            throw new IllegalArgumentException("The GATK can currently run in parallel only with TreeReducible walkers");
        // Having maxIterations in the execute method is a holdover from the old TraversalEngine days.
        // Let's do something else with this.
        traversalEngine.setMaximumIterations(maxIterations);
        ReduceTree reduceTree = new ReduceTree(this);
        initializeWalker(walker);
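Why the fast fail: the hierarchical scheduler merges partial reduce results pairwise across shards, which only works if two partial results can be combined. Here is a sketch of the idea; the treeReduce signature is an assumption modeled on the check above, not a copy of the real TreeReducible interface.

// Sketch: a tree-reducible computation can merge two partial reductions,
// letting the hierarchical scheduler combine shard results in any order.
interface TreeReducibleSketch<T> {
    T treeReduce(T lhs, T rhs);
}

final class ReadCounter implements TreeReducibleSketch<Long> {
    // Merging two partial counts is just addition, so any tree of
    // pairwise merges yields the same total as a sequential reduce.
    public Long treeReduce(Long lhs, Long rhs) {
        return lhs + rhs;
    }
}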

LinearMicroScheduler.java

@@ -47,11 +47,7 @@ public class LinearMicroScheduler extends MicroScheduler {
     * @param walker        Computation to perform over dataset.
     * @param shardStrategy A strategy for sharding the data.
     */
    public Object execute(Walker walker, ShardStrategy shardStrategy, int maxIterations) {
        // Having maxIterations in the execute method is a holdover from the old TraversalEngine days.
        // Let's do something else with this.
        traversalEngine.setMaximumIterations(maxIterations);
    public Object execute(Walker walker, ShardStrategy shardStrategy) {
        walker.initialize();
        Accumulator accumulator = Accumulator.create(engine, walker);

MicroScheduler.java

@@ -133,7 +133,7 @@ public abstract class MicroScheduler {
     *
     * @return the return type of the walker
     */
    public abstract Object execute(Walker walker, ShardStrategy shardStrategy, int iterations);
    public abstract Object execute(Walker walker, ShardStrategy shardStrategy);
    /**
     * Retrieves the object responsible for tracking and managing output.

TraversalEngine.java

@@ -24,20 +24,9 @@ public abstract class TraversalEngine<M,T,WalkerType extends Walker<M,T>,Provide
    private final long MAX_PROGRESS_PRINT_TIME = 30 * 1000; // 30 seconds in millisecs
    private final long N_RECORDS_TO_PRINT = 1000000;
    // Maximum number of reads to process before finishing
    protected long maximumIterations = -1;
    /** our log, which we want to capture anything from this class */
    protected static Logger logger = Logger.getLogger(TraversalEngine.class);
    /**
     * set the max number of iterations
     * @param maximumIterations the number of iterations
     */
    public void setMaximumIterations(final int maximumIterations) {
        this.maximumIterations = maximumIterations;
    }
    /**
     * @param curTime (current runtime, in millisecs)
     *

TraverseDuplicates.java

@@ -196,11 +196,6 @@ public class TraverseDuplicates<M,T> extends TraversalEngine<M,T,DuplicateWalker
            }
            printProgress(DUPS_STRING, site);
            if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
                logger.warn("Maximum number of duplicate sets encountered, terminating traversal " + TraversalStatistics.nRecords);
                break;
            }
        }
        return sum;

TraverseLoci.java

@@ -76,11 +76,6 @@ public class TraverseLoci<M,T> extends TraversalEngine<M,T,LocusWalker<M,T>,Locu
                sum = walker.reduce(x, sum);
            }
            if (this.maximumIterations > 0 && TraversalStatistics.nRecords > this.maximumIterations) {
                logger.warn("Maximum number of reads encountered, terminating traversal " + TraversalStatistics.nRecords);
                break;
            }
            printProgress(LOCI_STRING, locus.getLocation());
        }
    }

GATKArgumentCollectionUnitTest.java

@@ -80,7 +80,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest {
        List<File> input = new ArrayList<File>();
        input.add(new File("test.file"));
        collect.samFiles = input;
        collect.maximumEngineIterations = -1;
        collect.strictnessLevel = SAMFileReader.ValidationStringency.STRICT;
        collect.referenceFile = new File("referenceFile".toLowerCase());
        collect.DBSNPFile = "DBSNPFile".toLowerCase();

IndexPerformanceTests.java

@@ -14,6 +14,7 @@ import org.broad.tribble.vcf.VCFCodec;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableCodec;
import org.broadinstitute.sting.gatk.refdata.features.annotator.AnnotatorInputTableFeature;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.collections.Pair;
import org.junit.Assert;
@@ -162,35 +163,104 @@ public class IndexPerformanceTests extends BaseTest {
    //@Test
    public void testBigTable() {
        // store the mapping of location -> variant; this will get big
        Map<GenomeLoc, Integer> features = new TreeMap<GenomeLoc, Integer>();
        Map<Integer, Integer> bucketToCount = getMapOfFeatures(features, false);
        Pair<BasicFeatureSource, SAMSequenceDictionary> pairing;
        Map<GenomeLoc, Integer> features2 = new TreeMap<GenomeLoc, Integer>();
        Map<Integer, Integer> bucketToCount2 = getMapOfFeatures(features2, true);
        System.err.println("Summary: ");
        System.err.println("Summary: tree " + features.size());
        System.err.println("Summary: linear " + features2.size());
        // compare the two
        for (Map.Entry<GenomeLoc, Integer> entry : features.entrySet()) {
            if (!features2.containsKey(entry.getKey())) {
                System.err.println("key " + entry + " missing from linear, count " + entry.getValue());
            }
            else if (!features2.get(entry.getKey()).equals(entry.getValue())) {
                /*System.err.println("counts are not equal at " +
                        entry.getKey() +
                        " features2.get(entry.getKey()) = " +
                        features2.get(entry.getKey()) +
                        " feature1 = " + entry.getValue());*/
            }
            if (features2.containsKey(entry.getKey())) features2.remove(entry.getKey());
        }
        System.err.println("Missing from the tree:");
        for (Map.Entry<GenomeLoc, Integer> entry2 : features2.entrySet()) {
            System.err.println("Position " + entry2.getKey() + " count = " + entry2.getValue());
        }
        for (Integer bucket : bucketToCount.keySet()) {
            if (!bucketToCount2.get(bucket).equals(bucketToCount.get(bucket))) {
                System.err.println("Bucket " + bucket + " tree != linear, " + bucketToCount2.get(bucket) + " " + bucketToCount.get(bucket));
            }
        }
    }
    private Map<Integer, Integer> getMapOfFeatures(Map<GenomeLoc, Integer> features, boolean useLinear) {
        File bigTable = new File("/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/slowAnnotator/big.table.txt");
        TribbleRMDTrackBuilder.useLinearIndex = false;
        TribbleRMDTrackBuilder.useLinearIndex = useLinear;
        TribbleRMDTrackBuilder.binSize = 1000;
        deleteIndex(inputFiles.get("Big Table"));
        // time creating the index
        logger.warn("creating index");
        long createTime = System.currentTimeMillis();
        Map<Integer, Integer> bucketToCount = new TreeMap<Integer, Integer>();
        Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"), inputFiles.get("Big Table"));
        createTime = System.currentTimeMillis() - createTime;
        //System.err.println("index creation took " + createTime);
        PrintWriter stream = null;
        logger.warn("reading and writing");
        try {
            stream = new PrintWriter(new File("bigTable.out.tree"));
        } catch (FileNotFoundException e) {
            Assert.fail("Unable to open output file bigTable.out.tree");
        }
        try {
            for (int x = 1; x < 200000; x = x + 1000) {
            for (int x = 5000; x < 6000; x = x + 1000) {
                int bucketCount = 0;
                CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x + 1000); // query
                for (Feature feat : iter) {
                    stream.println(((AnnotatorInputTableFeature) feat).toString());
                    GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(), feat.getStart(), feat.getEnd());
                    if (loc.getStop() < 5000 || loc.getStart() > 6000) continue;
                    int count = 0;
                    if (features.containsKey(loc))
                        count = features.get(loc) + 1;
                    features.put(loc, count);
                    bucketCount++;
                }
                bucketToCount.put(x, bucketCount);
            }
        } catch (IOException e) {
            Assert.fail("Unable to load file for query!!");
        }
        stream.close();
        return bucketToCount;
    }
    //@Test
    public void testGetTreeIndexLocation() {
        File bigTable = new File("small.table.txt");
        TribbleRMDTrackBuilder.useLinearIndex = false;
        TribbleRMDTrackBuilder.binSize = 1000;
        deleteIndex(bigTable);
        // time creating the index
        logger.warn("creating index");
        Map<Integer, Integer> bucketToCount = new TreeMap<Integer, Integer>();
        Pair<BasicFeatureSource, SAMSequenceDictionary> pairing = builder.createFeatureReader(inputTypes.get("Big Table"), bigTable);
        try {
            int count = 0;
            for (int x = 5000; x < 6000; x = x + 1000) {
                CloseableTribbleIterator<Feature> iter = pairing.first.query("chr1", x, x + 1000); // query
                for (Feature feat : iter) {
                    GenomeLoc loc = GenomeLocParser.createGenomeLoc(feat.getChr(), feat.getStart(), feat.getEnd());
                    if (loc.getStop() < 5000 || loc.getStart() > 6000) continue;
                    count++;
                }
            }
            System.err.println(count);
        } catch (IOException e) {
            Assert.fail("Unable to load file for query!!");
        }
    }
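Taken together, the new tests exercise the Tree index fix by running the same windowed queries against a tree-indexed and a linear-indexed reader, bucketing feature counts per query window, and diffing the buckets. Below is a distilled, self-contained sketch of that comparison step; the map shapes are taken from the diff, everything else is illustrative.

import java.util.Map;
import java.util.TreeMap;

final class IndexComparisonSketch {
    // Distilled version of the bucket comparison at the end of testBigTable:
    // report every query window where the two index implementations disagree.
    static void compareBucketCounts(Map<Integer, Integer> tree, Map<Integer, Integer> linear) {
        for (Map.Entry<Integer, Integer> entry : tree.entrySet()) {
            Integer other = linear.get(entry.getKey());
            // use equals(), not ==, because these are boxed Integers
            if (other == null || !other.equals(entry.getValue()))
                System.err.println("Bucket " + entry.getKey() + " tree != linear, "
                        + entry.getValue() + " " + other);
        }
    }

    public static void main(String[] args) {
        Map<Integer, Integer> tree = new TreeMap<Integer, Integer>();
        Map<Integer, Integer> linear = new TreeMap<Integer, Integer>();
        tree.put(5000, 42);
        linear.put(5000, 41); // deliberate mismatch to show the report
        compareBucketCounts(tree, linear);
    }
}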