From 42eb356782ffd6bae05233e4e4fe92b17cee7fc7 Mon Sep 17 00:00:00 2001 From: ebanks Date: Fri, 3 Apr 2009 18:24:08 +0000 Subject: [PATCH] 1. modifed by read traversals with indexes to be more general 2. GenomeLocs for reads should have ends spanning the read (moved it to GenomeLoc from Utils) 3. Got rid of those stupid unmappable characters from comments in various files git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@289 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/dataSources/datum/ReadDatum.java | 2 +- .../gatk/dataSources/shards/ReadShard.java | 2 +- .../gatk/iterators/LocusIteratorByHanger.java | 4 +- .../sting/gatk/iterators/SortSamIterator.java | 4 +- .../gatk/iterators/VerifyingSamIterator.java | 4 +- .../gatk/traversals/TraversalEngine.java | 4 +- .../gatk/traversals/TraverseByReads.java | 12 +++-- .../broadinstitute/sting/utils/GenomeLoc.java | 54 ++++++++++++------- .../org/broadinstitute/sting/utils/Utils.java | 4 -- 9 files changed, 52 insertions(+), 38 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java index 5552af496..e45babdb7 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/datum/ReadDatum.java @@ -60,6 +60,6 @@ public class ReadDatum implements Datum { * @return a genome loc that details the region that our read spans. */ public GenomeLoc getSequenceLocation() { - return Utils.genomicLocationOf(sam); + return GenomeLoc.genomicLocationOf(sam); } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java index 5854777a9..4ea2349c0 100644 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/shards/ReadShard.java @@ -66,7 +66,7 @@ public class ReadShard implements DataShard { final List reads = Arrays.asList(read); // put together the genome location - final GenomeLoc loc = Utils.genomicLocationOf(read); + final GenomeLoc loc = GenomeLoc.genomicLocationOf(read); // Offset of a single read is always 0 List offsets = Arrays.asList(0); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java index 276adb5c7..afa9f873c 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByHanger.java @@ -155,7 +155,7 @@ public class LocusIteratorByHanger extends LocusIterator { return true; else { final SAMRecord read = it.peek(); - GenomeLoc readLoc = Utils.genomicLocationOf(read); + GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read); final boolean coveredP = currentPositionIsFullyCovered(readLoc); //System.out.printf("CoverP = %s => %b%n", readLoc, coveredP); return coveredP; @@ -177,7 +177,7 @@ public class LocusIteratorByHanger extends LocusIterator { SAMRecord read = it.next(); justCleared = false; - GenomeLoc readLoc = Utils.genomicLocationOf(read); + GenomeLoc readLoc = GenomeLoc.genomicLocationOf(read); if ( DEBUG ) { logger.debug(String.format(" Expanding window sizes %d with %d : left=%s, right=%s, readLoc = %s, cmp=%d%n", readHanger.size(), incrementSize, diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java index 1de48cfcb..dfa9d8fb2 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java @@ -53,8 +53,8 @@ public class SortSamIterator implements Iterator { } public int compareTo(ComparableSAMRecord o) { - GenomeLoc myLoc = Utils.genomicLocationOf(record); - GenomeLoc hisLoc = Utils.genomicLocationOf(o.getRecord()); + GenomeLoc myLoc = GenomeLoc.genomicLocationOf(record); + GenomeLoc hisLoc = GenomeLoc.genomicLocationOf(o.getRecord()); return myLoc.compareTo(hisLoc); } } diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index b68b08238..373c45bae 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -55,8 +55,8 @@ public class VerifyingSamIterator implements Iterator { if ( last == null || cur.getReadUnmappedFlag() ) return false; else { - GenomeLoc lastLoc = Utils.genomicLocationOf( last ); - GenomeLoc curLoc = Utils.genomicLocationOf( cur ); + GenomeLoc lastLoc = GenomeLoc.genomicLocationOf( last ); + GenomeLoc curLoc = GenomeLoc.genomicLocationOf( cur ); return curLoc.compareTo(lastLoc) == -1; } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 48e4293d0..eb96da5b3 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -188,7 +188,7 @@ public abstract class TraversalEngine { * regions specified by the location string. The string is of the form: * Of the form: loc1;loc2;... * Where each locN can be: - * Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' * * @param locStr */ @@ -201,7 +201,7 @@ public abstract class TraversalEngine { * regions specified by the location string. The string is of the form: * Of the form: loc1;loc2;... * Where each locN can be: - * Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + * 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' * * @param file_name */ diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java index 8d09619c8..36a14a355 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByReads.java @@ -8,12 +8,12 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.gatk.iterators.ReferenceIterator; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.FastaSequenceFile2; import java.util.List; import java.util.Arrays; import java.util.ArrayList; +import java.util.LinkedList; import java.io.File; import net.sf.samtools.SAMRecord; @@ -54,7 +54,8 @@ public class TraverseByReads extends TraversalEngine { */ public Object traverseByRead(ReadWalker walker, ArrayList locations) { samReadIter = initializeReads(); - GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName)); + if ( refFileName != null && !locations.isEmpty() ) + GenomeLoc.setupRefContigOrdering(new FastaSequenceFile2(refFileName)); if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) { logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing.")); @@ -72,13 +73,15 @@ public class TraverseByReads extends TraversalEngine { List offsets = Arrays.asList(0); // Offset of a single read is always 0 boolean done = false; + // copy the locations here in case we ever want to use the full list again later and so that we can remove efficiently + LinkedList notYetTraversedLocations = new LinkedList(locations); while (samReadIter.hasNext() && !done) { this.nRecords++; // get the next read final SAMRecord read = samReadIter.next(); final List reads = Arrays.asList(read); - GenomeLoc loc = Utils.genomicLocationOf(read); + GenomeLoc loc = GenomeLoc.genomicLocationOf(read); // Jump forward in the reference to this locus location LocusContext locus = new LocusContext(loc, reads, offsets); @@ -87,7 +90,8 @@ public class TraverseByReads extends TraversalEngine { locus.setReferenceContig(refSite.getCurrentContig()); } - if (GenomeLoc.inLocations(loc, locations)) { + GenomeLoc.removePastLocs(loc, notYetTraversedLocations); + if (GenomeLoc.overlapswithSortedLocsP(loc, notYetTraversedLocations, locations.isEmpty())) { // // execute the walker contact diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 8a6d86621..a7a49ff48 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -8,6 +8,7 @@ import net.sf.functionalj.Functions; import net.sf.functionalj.util.Operators; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.SAMRecord; import java.util.*; import java.util.regex.Pattern; @@ -41,7 +42,6 @@ public class GenomeLoc implements Comparable { //public static Map refContigOrdering = null; private static SAMSequenceDictionary contigInfo = null; private static HashMap interns = null; - private static int lastGoodIntervalIndex = 0; public static boolean hasKnownContigOrdering() { return contigInfo != null; @@ -129,6 +129,10 @@ public class GenomeLoc implements Comparable { this( new String(toCopy.getContig()), toCopy.getStart(), toCopy.getStop() ); } + public static GenomeLoc genomicLocationOf(final SAMRecord read) { + return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart(), read.getAlignmentEnd()); + } + // -------------------------------------------------------------------------------------------------------------- // // Parsing string representations @@ -140,7 +144,7 @@ public class GenomeLoc implements Comparable { } public static GenomeLoc parseGenomeLoc( final String str ) { - // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' //System.out.printf("Parsing location '%s'%n", str); final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 @@ -204,7 +208,7 @@ public class GenomeLoc implements Comparable { public static ArrayList parseGenomeLocs(final String str) { // Of the form: loc1;loc2;... // Where each locN can be: - // Ôchr2Õ, Ôchr2:1000000Õ or Ôchr2:1,000,000-2,000,000Õ + // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' StdReflect reflect = new JdkStdReflect(); FunctionN parseOne = reflect.staticFunction(GenomeLoc.class, "parseGenomeLoc", String.class); Function1 f1 = parseOne.f1(); @@ -272,29 +276,40 @@ public class GenomeLoc implements Comparable { if ( locs.size() == 0 ) { return true; } else { - for ( int i = lastGoodIntervalIndex; i < locs.size(); i++ ) { - GenomeLoc loc = locs.get(i); - // since it's ordered, we can do some simple checks to save us tons of time - if ( hasKnownContigOrdering() ) { - int curIndex = getContigIndex(curr.contig); - int locIndex = getContigIndex(loc.contig); - // skip loci before intervals begin - if (curIndex < locIndex) - return false; - // skip loci between intervals - if (curIndex == locIndex && curr.stop < loc.start) - return false; - } + for ( GenomeLoc loc : locs ) { //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); - if (loc.overlapsP(curr)) { - lastGoodIntervalIndex = i; + if (loc.overlapsP(curr)) return true; - } } return false; } } + public static void removePastLocs(GenomeLoc curr, List locs) { + while ( !locs.isEmpty() && curr.isPast(locs.get(0)) ) { + //System.out.println("At: " + curr + ", removing: " + locs.get(0)); + locs.remove(0); + } + } + + public static boolean overlapswithSortedLocsP(GenomeLoc curr, List locs, boolean returnTrueIfEmpty) { + if ( locs.isEmpty() ) + return returnTrueIfEmpty; + + // skip loci before intervals begin + if ( hasKnownContigOrdering() && getContigIndex(curr.contig) < getContigIndex(locs.get(0).contig) ) + return false; + + for ( GenomeLoc loc : locs ) { + //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); + if ( loc.overlapsP(curr) ) + return true; + if ( curr.compareTo(loc) < 0 ) + return false; + } + return false; + } + // // Accessors and setters // @@ -418,7 +433,6 @@ public class GenomeLoc implements Comparable { int thisIndex = getContigIndex(thisContig); int thatIndex = getContigIndex(thatContig); - if ( thisIndex == -1 ) { if ( thatIndex == -1 ) diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 4a3033fc3..81c3d1726 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -180,10 +180,6 @@ public class Utils { return new String(basesAsbytes); } - public static GenomeLoc genomicLocationOf(final SAMRecord read) { - return new GenomeLoc(read.getReferenceName(), read.getAlignmentStart()); - } - private static final Map readFlagNames = new HashMap();