From aacd72854f59d1559a8697132cd3a6586973449b Mon Sep 17 00:00:00 2001 From: aaron Date: Thu, 5 Nov 2009 05:26:33 +0000 Subject: [PATCH] a fix for a bug Andrey discovered: in read-based interval traversals we're dupplicating reads in rare cases. The problem was that to accomidate a bug in SAM JDK indexing, we were forced to add one to the stop of our QueryOverlapping() calls to ensure we always got all of the overlapping reads. Added a PlusOneFixIterator that wraps other iterators, and eliminates reads that start outside of our intended interval (interval stop - 1). Updated and checked BamToFastqIntegrationTest MD5 sums. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1976 348d0f76-0448-11de-a6fe-93d51630548a --- .../simpleDataSources/ReadStreamPointer.java | 2 +- .../simpleDataSources/SAMDataSource.java | 3 +- .../gatk/iterators/PlusOneFixIterator.java | 114 +++++++++++++++++ .../iterators/PlusOneFixIteratorTest.java | 120 ++++++++++++++++++ .../fasta/BamToFastqIntegrationTest.java | 4 +- 5 files changed, 239 insertions(+), 4 deletions(-) create mode 100644 java/src/org/broadinstitute/sting/gatk/iterators/PlusOneFixIterator.java create mode 100644 java/test/org/broadinstitute/sting/gatk/iterators/PlusOneFixIteratorTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java index 5df542a09..c69b74b6f 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReadStreamPointer.java @@ -143,7 +143,7 @@ class MappedReadStreamPointer extends ReadStreamPointer { // over a given interval would occasionally not pick up the last read in that interval. mergingIterator.queryOverlapping( segment.locus.getContig(), (int)segment.locus.getStart(), - (int)segment.locus.getStop()+1); + (int)segment.locus.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT); return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java index 89d905a3e..c7c6beb06 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/SAMDataSource.java @@ -172,7 +172,8 @@ public class SAMDataSource implements SimpleDataSource { reads.getSupplementalFilters()); // add the new overlapping detection iterator, if we have a last interval and we're a read based shard - if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL ) iterator = new IntervalOverlapIterator(iterator,mLastInterval,false); + if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL ) + iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false)); mLastInterval = shard.getGenomeLoc(); } else { diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/PlusOneFixIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/PlusOneFixIterator.java new file mode 100644 index 000000000..267ea99d1 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/PlusOneFixIterator.java @@ -0,0 +1,114 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMRecord; + +import java.util.Iterator; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.gatk.Reads; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Nov 4, 2009 + * Time: 10:41:02 PM + */ +public class PlusOneFixIterator implements StingSAMIterator { + + /** + * this holds the value that we use to correct our query overlapping calls with, + * but more importantly it ties the ReadStreamPointer code to this so we don't loose + * track of this adjustment across the code. + */ + public static final Integer PLUS_ONE_FIX_CONSTANT = 1; + + /** + * our interval region + */ + private final GenomeLoc mInterval; + + /** + * our iterator + */ + private final StingSAMIterator mIterator; + + /** + * our next record + */ + private SAMRecord mNextRecord; + + /** + * This iterator filters all reads that have their start < PLUS_ONE_FIX_CONSTANT from the end of the + * interval: + *

+ *

+ * (this won't make sense in a non-fixed width font) + * -------------- + * interval | + * ------------------------ + * | good read| + * -------------------------------------------- + * | overlap < PLUS_ONE_FIX_CONSTANT, we drop| + * ------------------------------------------- + *

+ * The problem is that we had to put a +1 on our queryOverlapping() stop position value to SAMTools + * due to an indexing problem. Other iterators don't know about this adjustment though, so calculations + * like overlap fail. This iterator eliminates them before the read is seen by other iterators. + *

+ * create a plus one fix iterator, given: + * + * @param inteveral the interval we're checking + * @param iterator the iterator to draw reads from + */ + public PlusOneFixIterator(GenomeLoc inteveral, StingSAMIterator iterator) { + if (inteveral == null || iterator == null) + throw new StingException("Both parameters to PlusOneFixIterator have to != null"); + + // save off the iterator and the interval + mInterval = inteveral; + mIterator = iterator; + + // pop off the first read + if (mIterator.hasNext()) { + next(); + } + } + + + @Override + public boolean hasNext() { + return (mNextRecord != null); + } + + @Override + public SAMRecord next() { + SAMRecord ret = mNextRecord; + while (mIterator.hasNext()) { + mNextRecord = mIterator.next(); + if (!(mNextRecord.getAlignmentStart() > (mInterval.getStop() - PLUS_ONE_FIX_CONSTANT))) return ret; + } + mNextRecord = null; + return ret; + } + + @Override + public void remove() { + throw new UnsupportedOperationException("REMOVE on PlusOneFixIterator is not supported"); + } + + @Override + public Reads getSourceInfo() { + return this.mIterator.getSourceInfo(); + } + + @Override + public void close() { + this.mIterator.close(); + } + + @Override + public Iterator iterator() { + return this; + } +} diff --git a/java/test/org/broadinstitute/sting/gatk/iterators/PlusOneFixIteratorTest.java b/java/test/org/broadinstitute/sting/gatk/iterators/PlusOneFixIteratorTest.java new file mode 100644 index 000000000..c32d67885 --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/iterators/PlusOneFixIteratorTest.java @@ -0,0 +1,120 @@ +package org.broadinstitute.sting.gatk.iterators; + +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.BaseTest; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.Assert; +import static org.junit.Assert.fail; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.Iterator; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; + +/** + * Created by IntelliJ IDEA. + * User: aaronmckenna + * Date: Nov 4, 2009 + * Time: 11:02:24 PM + */ +public class PlusOneFixIteratorTest extends BaseTest { + private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"); + private static IndexedFastaSequenceFile seq; + private int chromosomeOneReadCount = 885; + + //GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary()); + /** + * This function does the setup of our parser, before each method call. + *

+ * Called before every test case method. + */ + @BeforeClass + public static void beforeAll() { + try { + seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + } catch (FileNotFoundException e) { + fail("Unexpected Exception" + e.getLocalizedMessage()); + } + GenomeLocParser.setupRefContigOrdering(seq); + } + + /** + * there's a read starting at chr1:1108664, make our interval go from chr1 to 1108664, apply the iterator, + * and we shouldn't see the read. + */ + @Test + public void testReadAtEndOfInterval() { + int countOfReads = 0; + SAMFileReader reader = new SAMFileReader(bam, true); + final int size = 1108664; + reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT); + + GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size); + Iterator i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size))); + //Iterator i = reader.queryOverlapping("chr1", 1, size); + while (i.hasNext()) { + SAMRecord rec = i.next(); + countOfReads++; + } + Assert.assertEquals(2, countOfReads); + } + + /** + * there are 4296 reads in chr1, make sure we see them all. + * + * chr1 length = 247249719 (and no reads start at the last position + */ + @Test + public void testAllReads() { + int countOfReads = 0; + SAMFileReader reader = new SAMFileReader(bam, true); + reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT); + final int size = 247249719; + GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size); + Iterator i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size))); + //Iterator i = reader.queryOverlapping("chr1", 1, size); + while (i.hasNext()) { + SAMRecord rec = i.next(); + countOfReads++; + } + Assert.assertEquals(885, countOfReads); + } + + /** + * there are 4296 reads in chr1, make sure we see them all. + * + * chr1 length = 247249719 (and no reads start at the last position + */ + @Test + public void testAllReadsSharded() { + int countOfReads = 0; + + final int size = 247249719; + int currentStop = 0; + int incr = 100000; + GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size); + while (currentStop < size) { + SAMFileReader reader = new SAMFileReader(bam, true); + reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT); + int lastStop = (currentStop > 1) ? currentStop: 1; + if (currentStop + incr < size) + currentStop += incr; + else + currentStop = size; + Iterator i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", lastStop, currentStop))); + + while (i.hasNext()) { + SAMRecord rec = i.next(); + countOfReads++; + } + reader.close(); + } + Assert.assertEquals(885, countOfReads); + } +} + diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/fasta/BamToFastqIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/fasta/BamToFastqIntegrationTest.java index c46e76951..1d4ef2fad 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/fasta/BamToFastqIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/fasta/BamToFastqIntegrationTest.java @@ -12,13 +12,13 @@ public class BamToFastqIntegrationTest extends WalkerTest { WalkerTestSpec spec1 = new WalkerTestSpec( "-T BamToFastq -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s", 1, - Arrays.asList("f742731e17fba105c7daae0e4f80ca1d")); + Arrays.asList("2ec2ff5ac405099bf48186d2c7e5fabd")); executeTest("testBamToFasta", spec1); WalkerTestSpec spec2 = new WalkerTestSpec( "-T BamToFastq -reverse -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s", 1, - Arrays.asList("19a5418fdf7b53dac8badb67bb1e1b88")); + Arrays.asList("5a79484da43925b0e0461be28fdad07c")); executeTest("testBamToFastaReverse", spec2); } }