a fix for a bug Andrey discovered: in read-based interval traversals we're dupplicating reads in rare cases. The problem was that to accomidate a bug in SAM JDK indexing, we were forced to add one to the stop of our QueryOverlapping() calls to ensure we always got all of the overlapping reads.

Added a PlusOneFixIterator that wraps other iterators, and eliminates reads that start outside of our intended interval (interval stop - 1).  Updated and checked BamToFastqIntegrationTest MD5 sums.



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1976 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-11-05 05:26:33 +00:00
parent 15e80aec3b
commit aacd72854f
5 changed files with 239 additions and 4 deletions

View File

@ -143,7 +143,7 @@ class MappedReadStreamPointer extends ReadStreamPointer {
// over a given interval would occasionally not pick up the last read in that interval.
mergingIterator.queryOverlapping( segment.locus.getContig(),
(int)segment.locus.getStart(),
(int)segment.locus.getStop()+1);
(int)segment.locus.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT);
return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
}

View File

@ -172,7 +172,8 @@ public class SAMDataSource implements SimpleDataSource {
reads.getSupplementalFilters());
// add the new overlapping detection iterator, if we have a last interval and we're a read based shard
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL ) iterator = new IntervalOverlapIterator(iterator,mLastInterval,false);
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL )
iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false));
mLastInterval = shard.getGenomeLoc();
} else {

View File

@ -0,0 +1,114 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import java.util.Iterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.Reads;
/**
* Created by IntelliJ IDEA.
* User: aaronmckenna
* Date: Nov 4, 2009
* Time: 10:41:02 PM
*/
public class PlusOneFixIterator implements StingSAMIterator {
/**
* this holds the value that we use to correct our query overlapping calls with,
* but more importantly it ties the ReadStreamPointer code to this so we don't loose
* track of this adjustment across the code.
*/
public static final Integer PLUS_ONE_FIX_CONSTANT = 1;
/**
* our interval region
*/
private final GenomeLoc mInterval;
/**
* our iterator
*/
private final StingSAMIterator mIterator;
/**
* our next record
*/
private SAMRecord mNextRecord;
/**
* This iterator filters all reads that have their start < PLUS_ONE_FIX_CONSTANT from the end of the
* interval:
* <p/>
* <p/>
* (this won't make sense in a non-fixed width font)
* --------------
* interval |
* ------------------------
* | good read|
* --------------------------------------------
* | overlap < PLUS_ONE_FIX_CONSTANT, we drop|
* -------------------------------------------
* <p/>
* The problem is that we had to put a +1 on our queryOverlapping() stop position value to SAMTools
* due to an indexing problem. Other iterators don't know about this adjustment though, so calculations
* like overlap fail. This iterator eliminates them before the read is seen by other iterators.
* <p/>
* create a plus one fix iterator, given:
*
* @param inteveral the interval we're checking
* @param iterator the iterator to draw reads from
*/
public PlusOneFixIterator(GenomeLoc inteveral, StingSAMIterator iterator) {
if (inteveral == null || iterator == null)
throw new StingException("Both parameters to PlusOneFixIterator have to != null");
// save off the iterator and the interval
mInterval = inteveral;
mIterator = iterator;
// pop off the first read
if (mIterator.hasNext()) {
next();
}
}
@Override
public boolean hasNext() {
return (mNextRecord != null);
}
@Override
public SAMRecord next() {
SAMRecord ret = mNextRecord;
while (mIterator.hasNext()) {
mNextRecord = mIterator.next();
if (!(mNextRecord.getAlignmentStart() > (mInterval.getStop() - PLUS_ONE_FIX_CONSTANT))) return ret;
}
mNextRecord = null;
return ret;
}
@Override
public void remove() {
throw new UnsupportedOperationException("REMOVE on PlusOneFixIterator is not supported");
}
@Override
public Reads getSourceInfo() {
return this.mIterator.getSourceInfo();
}
@Override
public void close() {
this.mIterator.close();
}
@Override
public Iterator<SAMRecord> iterator() {
return this;
}
}

View File

@ -0,0 +1,120 @@
package org.broadinstitute.sting.gatk.iterators;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.BaseTest;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.Assert;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Iterator;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
/**
* Created by IntelliJ IDEA.
* User: aaronmckenna
* Date: Nov 4, 2009
* Time: 11:02:24 PM
*/
public class PlusOneFixIteratorTest extends BaseTest {
private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam");
private static IndexedFastaSequenceFile seq;
private int chromosomeOneReadCount = 885;
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
/**
* This function does the setup of our parser, before each method call.
* <p/>
* Called before every test case method.
*/
@BeforeClass
public static void beforeAll() {
try {
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
} catch (FileNotFoundException e) {
fail("Unexpected Exception" + e.getLocalizedMessage());
}
GenomeLocParser.setupRefContigOrdering(seq);
}
/**
* there's a read starting at chr1:1108664, make our interval go from chr1 to 1108664, apply the iterator,
* and we shouldn't see the read.
*/
@Test
public void testReadAtEndOfInterval() {
int countOfReads = 0;
SAMFileReader reader = new SAMFileReader(bam, true);
final int size = 1108664;
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size)));
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
while (i.hasNext()) {
SAMRecord rec = i.next();
countOfReads++;
}
Assert.assertEquals(2, countOfReads);
}
/**
* there are 4296 reads in chr1, make sure we see them all.
*
* chr1 length = 247249719 (and no reads start at the last position
*/
@Test
public void testAllReads() {
int countOfReads = 0;
SAMFileReader reader = new SAMFileReader(bam, true);
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
final int size = 247249719;
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size)));
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
while (i.hasNext()) {
SAMRecord rec = i.next();
countOfReads++;
}
Assert.assertEquals(885, countOfReads);
}
/**
* there are 4296 reads in chr1, make sure we see them all.
*
* chr1 length = 247249719 (and no reads start at the last position
*/
@Test
public void testAllReadsSharded() {
int countOfReads = 0;
final int size = 247249719;
int currentStop = 0;
int incr = 100000;
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
while (currentStop < size) {
SAMFileReader reader = new SAMFileReader(bam, true);
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
int lastStop = (currentStop > 1) ? currentStop: 1;
if (currentStop + incr < size)
currentStop += incr;
else
currentStop = size;
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", lastStop, currentStop)));
while (i.hasNext()) {
SAMRecord rec = i.next();
countOfReads++;
}
reader.close();
}
Assert.assertEquals(885, countOfReads);
}
}

View File

@ -12,13 +12,13 @@ public class BamToFastqIntegrationTest extends WalkerTest {
WalkerTestSpec spec1 = new WalkerTestSpec(
"-T BamToFastq -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
1,
Arrays.asList("f742731e17fba105c7daae0e4f80ca1d"));
Arrays.asList("2ec2ff5ac405099bf48186d2c7e5fabd"));
executeTest("testBamToFasta", spec1);
WalkerTestSpec spec2 = new WalkerTestSpec(
"-T BamToFastq -reverse -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
1,
Arrays.asList("19a5418fdf7b53dac8badb67bb1e1b88"));
Arrays.asList("5a79484da43925b0e0461be28fdad07c"));
executeTest("testBamToFastaReverse", spec2);
}
}