From 149ac3d96cee33f42bef481fcd41a0947e44cc92 Mon Sep 17 00:00:00 2001 From: jmaguire Date: Sun, 22 Mar 2009 12:04:11 +0000 Subject: [PATCH] Now iterate over a large set of tiny intervals efficiently. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@130 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 17 ++++- .../sting/gatk/TraversalEngine.java | 65 +++++++++++++++---- 2 files changed, 68 insertions(+), 14 deletions(-) diff --git a/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index ae704e737..aef2a4b2f 100644 --- a/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/core/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -1,17 +1,21 @@ package org.broadinstitute.sting.gatk; import net.sf.samtools.SAMFileReader.ValidationStringency; +import net.sf.samtools.SAMSequenceRecord; import edu.mit.broad.picard.cmdline.CommandLineProgram; import edu.mit.broad.picard.cmdline.Usage; import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.rodGFF; import java.io.*; -import java.util.HashMap; +import java.util.*; public class GenomeAnalysisTK extends CommandLineProgram { // Usage and parameters @@ -70,6 +74,17 @@ public class GenomeAnalysisTK extends CommandLineProgram { this.engine = new TraversalEngine(INPUT_FILE, REF_FILE_ARG, rods); + // Prepare the sort ordering w.r.t. the sequence dictionary + final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); + List refContigs = refFile.getSequenceDictionary().getSequences(); + HashMap refContigOrdering = new HashMap(); + int i = 0; + for ( SAMSequenceRecord contig : refContigs ) { + refContigOrdering.put(contig.getSequenceName(), i); + i++; + } + GenomeLoc.setContigOrdering(refContigOrdering); + ValidationStringency strictness; if ( STRICTNESS_ARG == null ) { strictness = ValidationStringency.STRICT; diff --git a/core/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java b/core/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java index c1d7195b3..3a6f2085a 100755 --- a/core/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java +++ b/core/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java @@ -203,10 +203,7 @@ public class TraversalEngine { GenomeLoc[] locs = (GenomeLoc[])result.toArray(new GenomeLoc[0]); Arrays.sort(locs); - for ( GenomeLoc l : locs ) - System.out.printf(" -> %s%n", l); - - System.out.printf(" Locations are: %s%n", Utils.join(" ", Functions.map( Operators.toString, Arrays.asList(locs) ) ) ); + System.out.printf(" Locations are: %s%n", Utils.join("\n", Functions.map( Operators.toString, Arrays.asList(locs) ) ) ); return locs; } @@ -220,9 +217,13 @@ public class TraversalEngine { */ public boolean inLocations( GenomeLoc curr ) { if ( this.locs == null ) + { return true; - else { - for ( GenomeLoc loc : this.locs ) { + } + else + { + for ( GenomeLoc loc : this.locs ) + { //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); if ( loc.overlapsP(curr) ) return true; @@ -537,21 +538,59 @@ public class TraversalEngine { boolean done = false; GenomeLoc prevLoc = null; - while ( iter.hasNext() && ! done ) { + int current_interval_index = -1; + int current_interval_offset = -1; + + while ( iter.hasNext() && ! done ) + { this.nRecords++; // actually get the read and hand it to the walker - final LocusContext locus = iter.next(); + LocusContext locus = iter.next(); // Poor man's version of index LOL // HALP! I HAZ 10K INTERVALS 2 INDX - GenomeLoc curLoc = locus.getLocation(); - if ( inLocations(curLoc) ) { - if ( prevLoc != null && curLoc.compareContigs(prevLoc) != 0 ) - System.out.printf("Traversing to next chromosome...%n"); + if ( ((this.locs != null) && (this.locs.length != 0)) && ((current_interval_index == -1) || (!locus.getLocation().overlapsP(this.locs[current_interval_index])))) + { + // Advance to the next locus. + current_interval_index += 1; + current_interval_offset = 0; + + if (this.locs.length <= current_interval_index) { done = true; break; } + + //System.out.format("DEBUG Seeking from %s to %s\n", locus.getLocation().toString(), this.locs[current_interval_index].toString()); + + while ((!locus.getLocation().overlapsP(this.locs[current_interval_index])) && (iter.hasNext())) + { + switch (locus.getLocation().compareTo(this.locs[current_interval_index])) + { + case -1 : + locus = iter.next(); + //System.out.format("DEBUG at %s\n", locus.getLocation().toString()); + break; + case 0 : + break; + case 1 : + current_interval_index += 1; + current_interval_offset = 0; + //System.out.format("DEBUG Giving up on old locus, Seeking from %s to %s\n", locus.getLocation().toString(), this.locs[current_interval_index].toString()); + break; + } + } + + //System.out.format("DEBUG Got there.\n"); + } + else + { + current_interval_offset += 1; + } + + { + //System.out.format("Working at %s\n", locus.getLocation().toString()); // Jump forward in the reference to this locus location - final ReferenceIterator refSite = refIter.seekForward(locus.getLocation()); + final ReferenceIterator refSite; + refSite = refIter.seekForward(locus.getLocation()); final char refBase = refSite.getBaseAsChar(); locus.setReferenceContig(refSite.getCurrentContig());