From 4faa680887875a0239acd69901fcf5d557d052a6 Mon Sep 17 00:00:00 2001 From: ebanks Date: Thu, 2 Apr 2009 20:19:39 +0000 Subject: [PATCH] *Massive* speed-up for interval-based by-read traversals. [Could do more optimizing, but this simple fix was good enough for now] git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@266 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/traversals/TraverseByReads.java | 2 ++ .../broadinstitute/sting/utils/GenomeLoc.java | 20 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java index 0d70379d7..639ad5407 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.FastaSequenceFile2; import java.util.List; import java.util.Arrays; @@ -52,6 +53,7 @@ public class TraverseByReads extends TraversalEngine { */ public Object traverseByRead(ReadWalker walker, ArrayList locations) { samReadIter = initializeReads(); + GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName)); if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) { logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing.")); diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 5e1c7869b..8a6d86621 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -41,6 +41,7 @@ public class GenomeLoc implements Comparable { //public static Map refContigOrdering = null; private static SAMSequenceDictionary contigInfo = null; private static HashMap interns = null; + private static int lastGoodIntervalIndex = 0; public static boolean hasKnownContigOrdering() { return contigInfo != null; @@ -271,10 +272,24 @@ public class GenomeLoc implements Comparable { if ( locs.size() == 0 ) { return true; } else { - for ( GenomeLoc loc : locs ) { + for ( int i = lastGoodIntervalIndex; i < locs.size(); i++ ) { + GenomeLoc loc = locs.get(i); + // since it's ordered, we can do some simple checks to save us tons of time + if ( hasKnownContigOrdering() ) { + int curIndex = getContigIndex(curr.contig); + int locIndex = getContigIndex(loc.contig); + // skip loci before intervals begin + if (curIndex < locIndex) + return false; + // skip loci between intervals + if (curIndex == locIndex && curr.stop < loc.start) + return false; + } //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); - if (loc.overlapsP(curr)) + if (loc.overlapsP(curr)) { + lastGoodIntervalIndex = i; return true; + } } return false; } @@ -403,6 +418,7 @@ public class GenomeLoc implements Comparable { int thisIndex = getContigIndex(thisContig); int thatIndex = getContigIndex(thatContig); + if ( thisIndex == -1 ) { if ( thatIndex == -1 )