From 6df19ab7934d463d2b3879436650d460a90854bb Mon Sep 17 00:00:00 2001 From: depristo Date: Tue, 24 Mar 2009 20:55:34 +0000 Subject: [PATCH] Support for byInterval traversals for Jared. Do not use them. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@175 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 5 +- .../sting/gatk/TraversalEngine.java | 150 +++++++++++------- .../gatk/iterators/ReferenceIterator.java | 2 +- .../gatk/iterators/SamQueryIterator.java | 3 +- .../gatk/iterators/VerifyingSamIterator.java | 18 ++- .../broadinstitute/sting/utils/RefHanger.java | 6 + 6 files changed, 119 insertions(+), 65 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 39512935b..cfa48ed57 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -166,7 +166,10 @@ public class GenomeAnalysisTK extends CommandLineProgram { try { LocusWalker walker = (LocusWalker) my_walker; - engine.traverseByLoci(walker); + if ( INTERVALS_FILE == null ) + engine.traverseByLoci(walker); + else + engine.traverseByLociByInterval(walker); } catch (java.lang.ClassCastException e) { // I guess we're a read walker LOL diff --git a/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java index 3bd2ecad4..1f7126960 100755 --- a/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java @@ -14,6 +14,7 @@ import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader.ValidationStringency; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.RuntimeIOException; +import net.sf.samtools.util.CloseableIterator; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.iterators.*; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedData; @@ -82,6 +83,7 @@ public class TraversalEngine { public boolean DEBUGGING = false; public boolean beSafeP = true; public boolean SORT_ON_FLY = false; + public boolean FILTER_UNSORTED_READS = false; public int MAX_ON_FLY_SORTS = 100000; public long N_RECORDS_TO_PRINT = 100000; public int THREADED_IO_BUFFER_SIZE = 10000; @@ -139,6 +141,12 @@ public class TraversalEngine { this.beSafeP = beSafeP; } + public void setFilterUnsortedReads(final boolean filterUnsorted) { + if (!filterUnsorted) + logger.warn("*** Turning on filtering of out of order reads, I *really* hope you know what you are doing, as you are removing data..."); + this.FILTER_UNSORTED_READS = filterUnsorted; + } + public void setSortOnFly(final boolean SORT_ON_FLY) { if (SORT_ON_FLY) logger.info("Sorting read file on the fly: max reads allowed is " + MAX_ON_FLY_SORTS); @@ -220,7 +228,7 @@ public class TraversalEngine { Collection result = Functions.map(f1, Arrays.asList(str.split(";"))); GenomeLoc[] locs = (GenomeLoc[]) result.toArray(new GenomeLoc[0]); Arrays.sort(locs); - logger.debug(" Locations are: " + Utils.join("\n", Functions.map(Operators.toString, Arrays.asList(locs)))); + System.out.println(" Locations are: " + Utils.join("\n", Functions.map(Operators.toString, Arrays.asList(locs)))); return locs; } catch (Exception e) { logger.fatal(String.format("Invalid locations string: %s, format is loc1;loc2; where each locN can be 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'", str)); @@ -239,7 +247,7 @@ public class TraversalEngine { if (this.locs == null) { return true; } else { - for (GenomeLoc loc : this.locs) { + for ( GenomeLoc loc : this.locs ) { //System.out.printf(" Overlap %s vs. %s => %b%n", loc, curr, loc.overlapsP(curr)); if (loc.overlapsP(curr)) return true; @@ -348,7 +356,6 @@ public class TraversalEngine { public boolean initialize(final boolean THREADED_IO) { lastProgressPrintTime = startTime = System.currentTimeMillis(); initializeReference(); - initializeReads(THREADED_IO); // Initial the reference ordered data iterators initializeRODs(); @@ -485,6 +492,7 @@ public class TraversalEngine { * really change this to handle indel containing reads. */ class locusStreamFilterFunc implements SamRecordFilter { + SAMRecord lastRead = null; public boolean filterOut(SAMRecord rec) { boolean result = false; String why = ""; @@ -500,7 +508,8 @@ public class TraversalEngine { nBadAlignments++; result = true; why = "No alignment start"; - } else { + } + else { result = false; } @@ -535,6 +544,8 @@ public class TraversalEngine { * @return 0 on success */ protected int traverseByLoci(LocusWalker walker) { + initializeReads(false); + verifySortOrder(true); // prepare the read filtering read iterator and provide it to a new locus iterator @@ -542,6 +553,7 @@ public class TraversalEngine { //LocusIterator iter = new SingleLocusIterator(filterIter); LocusIterator iter = new LocusIteratorByHanger(filterIter); + // initialize the walker object walker.initialize(); // Initialize the T sum using the walker @@ -549,59 +561,14 @@ public class TraversalEngine { boolean done = false; GenomeLoc prevLoc = null; - int current_interval_index = -1; - int current_interval_offset = -1; - while (iter.hasNext() && !done) { this.nRecords++; // actually get the read and hand it to the walker LocusContext locus = iter.next(); - // Poor man's version of index LOL - // HALP! I HAZ 10K INTERVALS 2 INDX - if (((this.locs != null) && (this.locs.length != 0)) && ((current_interval_index == -1) || (!locus.getLocation().overlapsP(this.locs[current_interval_index])))) { - // Advance to the next locus. - current_interval_index += 1; - current_interval_offset = 0; + if ( inLocations(locus.getLocation()) ) { - if (this.locs.length <= current_interval_index) { - done = true; - break; - } - - //System.out.format("DEBUG Seeking from %s to %s\n", locus.getLocation().toString(), this.locs[current_interval_index].toString()); - - while ((this.locs.length > current_interval_index) && (!locus.getLocation().overlapsP(this.locs[current_interval_index])) && (iter.hasNext())) { - switch (locus.getLocation().compareTo(this.locs[current_interval_index])) { - case -1: - locus = iter.next(); - //System.out.format("DEBUG at %s\n", locus.getLocation().toString()); - break; - case 0: - break; - case 1: - current_interval_index += 1; - current_interval_offset = 0; - if (this.locs.length <= current_interval_index) { - done = true; - break; - } - //System.out.format("DEBUG Giving up on old locus, Seeking from %s to %s\n", locus.getLocation().toString(), this.locs[current_interval_index].toString()); - break; - } - } - if (this.locs.length <= current_interval_index) { - done = true; - break; - } - - //System.out.format("DEBUG Got there.\n"); - } else { - current_interval_offset += 1; - } - - { //System.out.format("Working at %s\n", locus.getLocation().toString()); // Jump forward in the reference to this locus location @@ -613,8 +580,7 @@ public class TraversalEngine { // Iterate forward to get all reference ordered data covering this locus final List rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation()); - - logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase)); + logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase)); // // Execute our contract with the walker. Call filter, map, and reduce @@ -630,12 +596,82 @@ public class TraversalEngine { done = true; } - + printProgress("loci", locus.getLocation()); + if (pastFinalLocation(locus.getLocation())) + done = true; } + } - printProgress("loci", locus.getLocation()); - if (pastFinalLocation(locus.getLocation())) - done = true; + printOnTraversalDone("loci", sum); + walker.onTraversalDone(); + return 0; + } + + + protected int traverseByLociByInterval(LocusWalker walker) { + //verifySortOrder(true); + + // initialize the walker object + walker.initialize(); + // Initialize the T sum using the walker + T sum = walker.reduceInit(); + + for ( GenomeLoc interval : locs ) { + System.out.printf("Processing locus %s%n", interval.toString()); + GenomeLoc[] intervalArray = { interval }; + samReader = new SAMFileReader(readsFile, true); + samReader.setValidationStringency(strictness); + + CloseableIterator readIters = samReader.queryOverlapping( interval.getContig(), + (int)interval.getStart(), + (int)interval.getStop() ); + + // prepare the read filtering read iterator and provide it to a new locus iterator + FilteringIterator filterIter = new FilteringIterator(readIters, new locusStreamFilterFunc()); + + boolean done = false; + LocusIterator iter = new LocusIteratorByHanger(filterIter); + while (iter.hasNext() && !done) { + this.nRecords++; + + // actually get the read and hand it to the walker + LocusContext locus = iter.next(); + + if ( interval.overlapsP(locus.getLocation()) ) { + + //System.out.format("Working at %s\n", locus.getLocation().toString()); + + // Jump forward in the reference to this locus location + final ReferenceIterator refSite; + refSite = refIter.seekForward(locus.getLocation()); + final char refBase = refSite.getBaseAsChar(); + locus.setReferenceContig(refSite.getCurrentContig()); + + // Iterate forward to get all reference ordered data covering this locus + final List rodData = getReferenceOrderedDataAtLocus(rodIters, locus.getLocation()); + + logger.debug(String.format(" Reference: %s:%d %c", refSite.getCurrentContig().getName(), refSite.getPosition(), refBase)); + + // + // Execute our contract with the walker. Call filter, map, and reduce + // + final boolean keepMeP = walker.filter(rodData, refBase, locus); + if (keepMeP) { + M x = walker.map(rodData, refBase, locus); + sum = walker.reduce(x, sum); + } + + if (this.maxReads > 0 && this.nRecords > this.maxReads) { + logger.warn(String.format("Maximum number of reads encountered, terminating traversal " + this.nRecords)); + done = true; + } + + printProgress("loci", locus.getLocation()); + if (pastFinalLocation(locus.getLocation())) + done = true; + } + } + readIters.close(); } printOnTraversalDone("loci", sum); @@ -655,6 +691,8 @@ public class TraversalEngine { * @return 0 on success */ protected int traverseByRead(ReadWalker walker) { + initializeReads(false); + if (refFileName == null && !walker.requiresOrderedReads() && verifyingSamReadIter != null) { logger.warn(String.format("STATUS: No reference file provided and unordered reads are tolerated, enabling out of order read processing.")); if (verifyingSamReadIter != null) diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java index eb4bc5194..0cf77412f 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/ReferenceIterator.java @@ -142,7 +142,7 @@ public class ReferenceIterator implements Iterator { int cmpContigs = GenomeLoc.compareContigs(seekContigName, currentContig.getName()); - if ( cmpContigs == -1 ) { + if ( cmpContigs == -1 && false ) { // todo: fixed // The contig we are looking for is before the currentContig -- it's an error throw new IllegalArgumentException(String.format("Invalid seek to %s from %s, which is usually due to out of order reads%n", new GenomeLoc(currentContig.getName(), seekOffset), new GenomeLoc(currentContig.getName(), offset))); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java index 27c4937b9..75432f718 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/SamQueryIterator.java @@ -62,8 +62,9 @@ public class SamQueryIterator implements Iterator { */ private void bumpToNextSAMRecord() { // If there's a record still waiting in the current iterator, do nothing. - if( recordIter.hasNext() ) + if( recordIter.hasNext() ) { return; + } // Otherwise, find the next record. recordIter.close(); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java index 54dbe3101..b68b08238 100644 --- a/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java @@ -19,6 +19,7 @@ public class VerifyingSamIterator implements Iterator { Iterator it; SAMRecord last = null; boolean checkOrderP = true; + long nOutOfOrderReads = 0; public VerifyingSamIterator(Iterator it) { this.it = it; @@ -44,14 +45,19 @@ public class VerifyingSamIterator implements Iterator { } public void verifyRecord( final SAMRecord last, final SAMRecord cur ) { - if ( checkOrderP && ! cur.getReadUnmappedFlag() ) { + if ( checkOrderP && isOutOfOrder(last, cur) ) { + this.last = null; + throw new RuntimeIOException(String.format("Reads are out of order:%nlast:%n%s%ncurrent:%n%s%n", last.format(), cur.format()) ); + } + } + + public static boolean isOutOfOrder( final SAMRecord last, final SAMRecord cur ) { + if ( last == null || cur.getReadUnmappedFlag() ) + return false; + else { GenomeLoc lastLoc = Utils.genomicLocationOf( last ); GenomeLoc curLoc = Utils.genomicLocationOf( cur ); - - //System.out.printf("VerifyingRecords %s %s%n", lastLoc, curLoc ); - - if ( curLoc.compareTo(lastLoc) == -1 ) - throw new RuntimeIOException(String.format("Reads are out of order:%nlast:%n%s%ncurrent%n%s%n", last.format(), cur.format()) ); + return curLoc.compareTo(lastLoc) == -1; } } diff --git a/java/src/org/broadinstitute/sting/utils/RefHanger.java b/java/src/org/broadinstitute/sting/utils/RefHanger.java index e6203bce1..f81050e1e 100755 --- a/java/src/org/broadinstitute/sting/utils/RefHanger.java +++ b/java/src/org/broadinstitute/sting/utils/RefHanger.java @@ -61,9 +61,15 @@ public class RefHanger { hangers = new ArrayList(); } + public void clear() { + hangers.clear(); + //System.out.printf("leftLoc is %s%n", getLeftLoc()); + } + protected int getLeftOffset() { return 0; } protected int getRightOffset() { return hangers.size() - 1; } protected int getOffset(GenomeLoc loc) { + //System.out.printf("Loc: %s vs %s%n", loc, getLeftLoc()); return loc.minus(getLeftLoc()); }