diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java index 5552af496..e45babdb7 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java @@ -60,6 +60,6 @@ public class ReadDatum implements Datum { * @return a genome loc that details the region that our read spans. */ public GenomeLoc getSequenceLocation() { - return Utils.genomicLocationOf(sam); + return GenomeLoc.genomicLocationOf(sam); } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java index 5854777a9..4ea2349c0 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java @@ -66,7 +66,7 @@ public class ReadShard implements DataShard { final List reads = Arrays.asList(read); // put together the genome location - final GenomeLoc loc = Utils.genomicLocationOf(read); + final GenomeLoc loc = GenomeLoc.genomicLocationOf(read); // Offset of a single read is always 0 List offsets = Arrays.asList(0); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java index 276adb5c7..afa9f873c 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java @@ -155,7 +155,7 @@ public class LocusIteratorByHanger extends LocusIterator { return true; else { final SAMRecord read = it.peek(); - GenomeLoc readLoc = Utils.genomicLocationOf(read); + GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read); final boolean coveredP = currentPositionIsFullyCovered(readLoc); //System.out.printf("CoverP = %s => %b%n", readLoc, coveredP); return coveredP; @@ -177,7 +177,7 @@ public class LocusIteratorByHanger extends LocusIterator { SAMRecord read = it.next(); justCleared = false; - GenomeLoc readLoc = Utils.genomicLocationOf(read); + GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read); if ( DEBUG ) { logger.debug(String.format(" Expanding window sizes %d with %d : left=%s, right=%s, readLoc = %s, cmp=%d%n", readHanger.size(), incrementSize, diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java index 1de48cfcb..dfa9d8fb2 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java @@ -53,8 +53,8 @@ public class SortSamIterator implements Iterator { } public int compareTo(ComparableSAMRecord o) { - GenomeLoc myLoc = Utils.genomicLocationOf(record); - GenomeLoc hisLoc = Utils.genomicLocationOf(o.getRecord()); + GenomeLoc myLoc = GenomeLoc.genomicLocationOf(record); + GenomeLoc hisLoc = GenomeLoc.genomicLocationOf(o.getRecord()); return myLoc.compareTo(hisLoc); } } diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index b68b08238..373c45bae 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -55,8 +55,8 @@ public class VerifyingSamIterator implements Iterator { if ( last == null || cur.getReadUnmappedFlag() ) return false; else { - GenomeLoc lastLoc = Utils.genomicLocationOf( last ); - GenomeLoc curLoc = Utils.genomicLocationOf( cur ); + GenomeLoc lastLoc = GenomeLoc.genomicLocationOf( last ); + GenomeLoc curLoc = GenomeLoc.genomicLocationOf( cur ); return curLoc.compareTo(lastLoc) == -1; } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 48e4293d0..eb96da5b3 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -188,7 +188,7 @@ public abstract class TraversalEngine { * regions specified by the location string. The string is of the form: * Of the form: loc1;loc2;... * Where each locN can be: - * Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' * * @param locStr */ @@ -201,7 +201,7 @@ public abstract class TraversalEngine { * regions specified by the location string. The string is of the form: * Of the form: loc1;loc2;... * Where each locN can be: - * Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' * * @param file_name */ diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java index 8d09619c8..36a14a355 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java @@ -8,12 +8,12 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.FastaSequenceFile2; import java.util.List; import java.util.Arrays; import java.util.ArrayList; +import java.util.LinkedList; import java.io.File; import net.sf.samtools.SAMRecord; @@ -54,7 +54,8 @@ public class TraverseByReads extends TraversalEngine { */ public Object traverseByRead(ReadWalker walker, ArrayList locations) { samReadIter = initializeReads(); - GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName)); + if ( refFileName != null && !locations.isEmpty() ) + GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName)); if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) { logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing.")); @@ -72,13 +73,15 @@ public class TraverseByReads extends TraversalEngine { List offsets = Arrays.asList(0); // Offset of a single read is always 0 boolean done = false; + // copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently + LinkedList notYetTraversedLocations = new LinkedList(locations); while (samReadIter.hasNext() && !done) { this.nRecords++; // get the next read final SAMRecord read = samReadIter.next(); final List reads = Arrays.asList(read); - GenomeLoc loc = Utils.genomicLocationOf(read); + GenomeLoc loc = GenomeLoc.genomicLocationOf(read); // Jump forward in the reference to this locus location LocusContext locus = new LocusContext(loc, reads, offsets); @@ -87,7 +90,8 @@ public class TraverseByReads extends TraversalEngine { locus.setReferenceContig(refSite.getCurrentContig()); } - if (GenomeLoc.inLocations(loc, locations)) { + GenomeLoc.removePastLocs(loc, notYetTraversedLocations); + if (GenomeLoc.overlapswithSortedLocsP(loc, notYetTraversedLocations, locations.isEmpty())) { // // execute the walker contact diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 8a6d86621..a7a49ff48 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -8,6 +8,7 @@ import net.sf.functionalj.Functions; import net.sf.functionalj.util.Operators; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.SAMRecord; import java.util.*; import java.util.regex.Pattern; @@ -41,7 +42,6 @@ public class GenomeLoc implements Comparable { //public static Map refContigOrdering = null; private static SAMSequenceDictionary contigInfo = null; private static HashMap interns = null; - private static int lastGoodIntervalIndex = 0; public static boolean hasKnownContigOrdering() { return contigInfo != null; @@ -129,6 +129,10 @@ public class GenomeLoc implements Comparable { this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() ); } + public static GenomeLoc genomicLocationOf(final SAMRecord read) { + return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart(), read.getAlignmentEnd()); + } + // -------------------------------------------------------------------------------------------------------------- // // Parsing string representations @@ -140,7 +144,7 @@ public class GenomeLoc implements Comparable { } public static GenomeLoc parseGenomeLoc( final String str ) { - // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' //System.out.printf("Parsing location '%s'%n", str); final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 @@ -204,7 +208,7 @@ public class GenomeLoc implements Comparable { public static ArrayList parseGenomeLocs(final String str) { // Of the form: loc1;loc2;... // Where each locN can be: - // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' StdReflect reflect = new JdkStdReflect(); FunctionN parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class); Function1 f1 = parseOne.f1(); @@ -272,29 +276,40 @@ public class GenomeLoc implements Comparable { if ( locs.size() == 0 ) { return true; } else { - for ( int i = lastGoodIntervalIndex; i < locs.size(); i++ ) { - GenomeLoc loc = locs.get(i); - // since it's ordered, we can do some simple checks to save us tons of time - if ( hasKnownContigOrdering() ) { - int curIndex = getContigIndex(curr.contig); - int locIndex = getContigIndex(loc.contig); - // skip loci before intervals begin - if (curIndex < locIndex) - return false; - // skip loci between intervals - if (curIndex == locIndex && curr.stop < loc.start) - return false; - } + for ( GenomeLoc loc : locs ) { //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); - if (loc.overlapsP(curr)) { - lastGoodIntervalIndex = i; + if (loc.overlapsP(curr)) return true; - } } return false; } } + public static void removePastLocs(GenomeLoc curr, List locs) { + while ( !locs.isEmpty() && curr.isPast(locs.get(0)) ) { + //System.out.println("At: " + curr + ", removing: " + locs.get(0)); + locs.remove(0); + } + } + + public static boolean overlapswithSortedLocsP(GenomeLoc curr, List locs, boolean returnTrueIfEmpty) { + if ( locs.isEmpty() ) + return returnTrueIfEmpty; + + // skip loci before intervals begin + if ( hasKnownContigOrdering() && getContigIndex(curr.contig) < getContigIndex(locs.get(0).contig) ) + return false; + + for ( GenomeLoc loc : locs ) { + //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); + if ( loc.overlapsP(curr) ) + return true; + if ( curr.compareTo(loc) < 0 ) + return false; + } + return false; + } + // // Accessors and setters // @@ -418,7 +433,6 @@ public class GenomeLoc implements Comparable { int thisIndex = getContigIndex(thisContig); int thatIndex = getContigIndex(thatContig); - if ( thisIndex == -1 ) { if ( thatIndex == -1 ) diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 4a3033fc3..81c3d1726 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -180,10 +180,6 @@ public class Utils { return new String(basesAsbytes); } - public static GenomeLoc genomicLocationOf(final SAMRecord read) { - return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart()); - } - private static final Map readFlagNames = new HashMap();