From e77d735e08528e88fe7f38705beef9c8ce0e326f Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 22 Mar 2009 19:54:02 +0000 Subject: [PATCH] New reference iterator that works with the new FastaSequenceFile seek operations. Greatly improves performance of jumping around in the genome. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@139 348d0f76-0448-11de-a6fe-93d51630548a --- .../gatk/iterators/ReferenceIterator.java | 114 ++++++++++-------- 1 file changed, 65 insertions(+), 49 deletions(-) diff --git a/core/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java b/core/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java index 3f4e0902d..eb4bc5194 100755 --- a/core/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java +++ b/core/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java @@ -3,11 +3,14 @@ package org.broadinstitute.sting.gatk.iterators; import edu.mit.broad.picard.reference.ReferenceSequenceFile; import edu.mit.broad.picard.reference.ReferenceSequence; import net.sf.samtools.util.StringUtil; +import net.sf.samtools.util.RuntimeIOException; import java.util.Iterator; import java.util.NoSuchElementException; +import java.io.IOException; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.FastaSequenceFile2; /** * Created by IntelliJ IDEA. @@ -18,14 +21,17 @@ import org.broadinstitute.sting.utils.GenomeLoc; */ public class ReferenceIterator implements Iterator { + // enable debugging output? + private final boolean DEBUG = false; + // The reference sequence file generator - private ReferenceSequenceFile refFile; + private FastaSequenceFile2 refFile; private ReferenceSequence currentContig = null; - private ReferenceSequence nextContig = null; + //private ReferenceSequence nextContig = null; private long offset = -1; - public ReferenceIterator( ReferenceSequenceFile refFile ) { + public ReferenceIterator( FastaSequenceFile2 refFile ) { this.refFile = refFile; } @@ -55,27 +61,20 @@ public class ReferenceIterator implements Iterator { return true; } else { - return loadNextContig(); + return readNextContig(); } } public ReferenceIterator next() { if ( currentContig != null ) { - //System.out.printf(" -> %s:%d %d%n", currentContig.getName(), offset, currentContig.length()); + if ( DEBUG ) System.out.printf(" -> %s:%d %d%n", currentContig.getName(), offset, currentContig.length()); } offset++; // move on to the next position if ( currentContig == null || offset >= currentContig.length() ) { // We need to update the contig - //System.out.printf(" -> Updating length%n"); - if ( nextContig != null ) { - // We've already loaded the next contig, swap it in, and recursively call next - swapNextContig(); - return next(); - } - else if ( loadNextContig() ){ + if ( readNextContig() ){ // We sucessfully loaded the next contig, recursively call next - offset = -1; return next(); } else { @@ -108,49 +107,61 @@ public class ReferenceIterator implements Iterator { return seekForwardOffset(contigName, pos - 1); } - private ReferenceIterator seekForwardOffset(final String contigName, final long seekOffset) { - assert contigName != null : "contigName is null"; + /** + * Helper routine that doesn't move the contigs around, it just checks that everything is kosher in the seek + * within this chromosome + * @param seekContigName name for printing pursues, asserted to be the current contig name + * @param seekOffset where we want to be in this contig + * @return this setup to be at seekoffset within seekContigName + */ + private ReferenceIterator seekForwardOffsetOnSameContig(final String seekContigName, final long seekOffset) { + assert seekContigName.equals(currentContig.getName()) : String.format("only works on this contig, but the current %s and sought %s contigs are different!", currentContig.getName(), seekContigName); + + // we're somewhere on this contig + if ( seekOffset < offset || seekOffset >= currentContig.length() ) { + // bad boy -- can't go backward safely or just beyond the contig length + throw new IllegalArgumentException(String.format("Invalid seek to %s from %s, which is usually due to out of order reads%n", + new GenomeLoc(currentContig.getName(), seekOffset), new GenomeLoc(currentContig.getName(), offset))); + } + else { + offset = seekOffset - 1; + return next(); + } + } + + + private ReferenceIterator seekForwardOffset(final String seekContigName, final long seekOffset) { + assert seekContigName != null : "seekContigName is null"; assert seekOffset >= 0 : "seekOffset < 0: " + seekOffset; // jumps us forward in the sequence to the contig / pos if ( currentContig == null ) next(); - //System.out.printf(" -> Seeking to %s %d from %s %d%n", contigName, seekOffset, currentContig.getName(), offset); - int cmpContigs = GenomeLoc.compareContigs(contigName, currentContig.getName()); + if ( DEBUG ) System.out.printf(" -> Seeking to %s %d from %s %d%n", seekContigName, seekOffset, currentContig.getName(), offset); + + int cmpContigs = GenomeLoc.compareContigs(seekContigName, currentContig.getName()); + if ( cmpContigs == -1 ) { // The contig we are looking for is before the currentContig -- it's an error throw new IllegalArgumentException(String.format("Invalid seek to %s from %s, which is usually due to out of order reads%n", new GenomeLoc(currentContig.getName(), seekOffset), new GenomeLoc(currentContig.getName(), offset))); } - else if ( cmpContigs == 0 ) { - // we're somewhere on this contig - if ( seekOffset < offset || seekOffset >= currentContig.length() ) { - // bad boy -- can't go backward safely or just beyond the contig length - throw new IllegalArgumentException(String.format("Invalid seek to %s from %s, which is usually due to out of order reads%n", + else if ( cmpContigs == 1 ) { + // we need to jump forward + if ( DEBUG ) System.out.printf(" -> Seeking in the fasta file to %s from %s%n", seekContigName, currentContig.getName()); + + if ( ! refFile.seekToContig(seekContigName) ) { // ok, do the seek + // a false result indicates a failure, throw a somewhat cryptic call + throw new RuntimeIOException(String.format("Unexpected seek failure from %s from %s%n", new GenomeLoc(currentContig.getName(), seekOffset), new GenomeLoc(currentContig.getName(), offset))); - //return null; - } - else { - offset = seekOffset - 1; - return next(); - } - } - else { - while (true) { - //System.out.printf("Seeking to contig %s, cur=%s, next=%s%n", contigName, currentContig.getName(), - // nextContig != null ? nextContig.getName() : "not loaded yet"); - // go searching through the reference - if ( ! loadNextContig() ) { - // never found anything - return null; - } - else if ( GenomeLoc.compareContigs( nextContig.getName(), contigName ) == 0 ) { - swapNextContig(); - return seekForwardOffset(contigName, seekOffset); - } } + + readNextContig(); // since we haven't failed, we just read in the next contig (which is seekContigName) } + + // at this point, the current contig is seekContigName, so just do a bit more error checking and be done + return seekForwardOffsetOnSameContig( seekContigName, seekOffset ); } @@ -159,15 +170,20 @@ public class ReferenceIterator implements Iterator { // Interal state manipulation // // -------------------------------------------------------------------------------------------------------------- - protected boolean loadNextContig() { + protected boolean readNextContig() { // returns true if we had another contig to load - nextContig = refFile.nextSequence(); - return nextContig != null; + currentContig = refFile.nextSequence(); + offset = -1; + return currentContig != null; } - protected void swapNextContig() { - currentContig = nextContig; - nextContig = null; - offset = -1; + /** + * Simple forwarding method to the refFile itself + * + * @return + */ + public String nextContigName() { + return this.refFile.getNextContigName(); } + }