a fix for a bug Andrey discovered: in read-based interval traversals we're dupplicating reads in rare cases. The problem was that to accomidate a bug in SAM JDK indexing, we were forced to add one to the stop of our QueryOverlapping() calls to ensure we always got all of the overlapping reads.
Added a PlusOneFixIterator that wraps other iterators, and eliminates reads that start outside of our intended interval (interval stop - 1). Updated and checked BamToFastqIntegrationTest MD5 sums. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1976 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
15e80aec3b
commit
aacd72854f
|
|
@ -143,7 +143,7 @@ class MappedReadStreamPointer extends ReadStreamPointer {
|
||||||
// over a given interval would occasionally not pick up the last read in that interval.
|
// over a given interval would occasionally not pick up the last read in that interval.
|
||||||
mergingIterator.queryOverlapping( segment.locus.getContig(),
|
mergingIterator.queryOverlapping( segment.locus.getContig(),
|
||||||
(int)segment.locus.getStart(),
|
(int)segment.locus.getStart(),
|
||||||
(int)segment.locus.getStop()+1);
|
(int)segment.locus.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT);
|
||||||
|
|
||||||
return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
|
return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -172,7 +172,8 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
reads.getSupplementalFilters());
|
reads.getSupplementalFilters());
|
||||||
|
|
||||||
// add the new overlapping detection iterator, if we have a last interval and we're a read based shard
|
// add the new overlapping detection iterator, if we have a last interval and we're a read based shard
|
||||||
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL ) iterator = new IntervalOverlapIterator(iterator,mLastInterval,false);
|
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL )
|
||||||
|
iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false));
|
||||||
mLastInterval = shard.getGenomeLoc();
|
mLastInterval = shard.getGenomeLoc();
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,114 @@
|
||||||
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: aaronmckenna
|
||||||
|
* Date: Nov 4, 2009
|
||||||
|
* Time: 10:41:02 PM
|
||||||
|
*/
|
||||||
|
public class PlusOneFixIterator implements StingSAMIterator {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* this holds the value that we use to correct our query overlapping calls with,
|
||||||
|
* but more importantly it ties the ReadStreamPointer code to this so we don't loose
|
||||||
|
* track of this adjustment across the code.
|
||||||
|
*/
|
||||||
|
public static final Integer PLUS_ONE_FIX_CONSTANT = 1;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* our interval region
|
||||||
|
*/
|
||||||
|
private final GenomeLoc mInterval;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* our iterator
|
||||||
|
*/
|
||||||
|
private final StingSAMIterator mIterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* our next record
|
||||||
|
*/
|
||||||
|
private SAMRecord mNextRecord;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This iterator filters all reads that have their start < PLUS_ONE_FIX_CONSTANT from the end of the
|
||||||
|
* interval:
|
||||||
|
* <p/>
|
||||||
|
* <p/>
|
||||||
|
* (this won't make sense in a non-fixed width font)
|
||||||
|
* --------------
|
||||||
|
* interval |
|
||||||
|
* ------------------------
|
||||||
|
* | good read|
|
||||||
|
* --------------------------------------------
|
||||||
|
* | overlap < PLUS_ONE_FIX_CONSTANT, we drop|
|
||||||
|
* -------------------------------------------
|
||||||
|
* <p/>
|
||||||
|
* The problem is that we had to put a +1 on our queryOverlapping() stop position value to SAMTools
|
||||||
|
* due to an indexing problem. Other iterators don't know about this adjustment though, so calculations
|
||||||
|
* like overlap fail. This iterator eliminates them before the read is seen by other iterators.
|
||||||
|
* <p/>
|
||||||
|
* create a plus one fix iterator, given:
|
||||||
|
*
|
||||||
|
* @param inteveral the interval we're checking
|
||||||
|
* @param iterator the iterator to draw reads from
|
||||||
|
*/
|
||||||
|
public PlusOneFixIterator(GenomeLoc inteveral, StingSAMIterator iterator) {
|
||||||
|
if (inteveral == null || iterator == null)
|
||||||
|
throw new StingException("Both parameters to PlusOneFixIterator have to != null");
|
||||||
|
|
||||||
|
// save off the iterator and the interval
|
||||||
|
mInterval = inteveral;
|
||||||
|
mIterator = iterator;
|
||||||
|
|
||||||
|
// pop off the first read
|
||||||
|
if (mIterator.hasNext()) {
|
||||||
|
next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasNext() {
|
||||||
|
return (mNextRecord != null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public SAMRecord next() {
|
||||||
|
SAMRecord ret = mNextRecord;
|
||||||
|
while (mIterator.hasNext()) {
|
||||||
|
mNextRecord = mIterator.next();
|
||||||
|
if (!(mNextRecord.getAlignmentStart() > (mInterval.getStop() - PLUS_ONE_FIX_CONSTANT))) return ret;
|
||||||
|
}
|
||||||
|
mNextRecord = null;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("REMOVE on PlusOneFixIterator is not supported");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Reads getSourceInfo() {
|
||||||
|
return this.mIterator.getSourceInfo();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
this.mIterator.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Iterator<SAMRecord> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,120 @@
|
||||||
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Created by IntelliJ IDEA.
|
||||||
|
* User: aaronmckenna
|
||||||
|
* Date: Nov 4, 2009
|
||||||
|
* Time: 11:02:24 PM
|
||||||
|
*/
|
||||||
|
public class PlusOneFixIteratorTest extends BaseTest {
|
||||||
|
private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam");
|
||||||
|
private static IndexedFastaSequenceFile seq;
|
||||||
|
private int chromosomeOneReadCount = 885;
|
||||||
|
|
||||||
|
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||||
|
/**
|
||||||
|
* This function does the setup of our parser, before each method call.
|
||||||
|
* <p/>
|
||||||
|
* Called before every test case method.
|
||||||
|
*/
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeAll() {
|
||||||
|
try {
|
||||||
|
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
fail("Unexpected Exception" + e.getLocalizedMessage());
|
||||||
|
}
|
||||||
|
GenomeLocParser.setupRefContigOrdering(seq);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* there's a read starting at chr1:1108664, make our interval go from chr1 to 1108664, apply the iterator,
|
||||||
|
* and we shouldn't see the read.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testReadAtEndOfInterval() {
|
||||||
|
int countOfReads = 0;
|
||||||
|
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||||
|
final int size = 1108664;
|
||||||
|
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||||
|
|
||||||
|
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
|
||||||
|
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size)));
|
||||||
|
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
|
||||||
|
while (i.hasNext()) {
|
||||||
|
SAMRecord rec = i.next();
|
||||||
|
countOfReads++;
|
||||||
|
}
|
||||||
|
Assert.assertEquals(2, countOfReads);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* there are 4296 reads in chr1, make sure we see them all.
|
||||||
|
*
|
||||||
|
* chr1 length = 247249719 (and no reads start at the last position
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testAllReads() {
|
||||||
|
int countOfReads = 0;
|
||||||
|
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||||
|
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||||
|
final int size = 247249719;
|
||||||
|
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
|
||||||
|
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size)));
|
||||||
|
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
|
||||||
|
while (i.hasNext()) {
|
||||||
|
SAMRecord rec = i.next();
|
||||||
|
countOfReads++;
|
||||||
|
}
|
||||||
|
Assert.assertEquals(885, countOfReads);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* there are 4296 reads in chr1, make sure we see them all.
|
||||||
|
*
|
||||||
|
* chr1 length = 247249719 (and no reads start at the last position
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testAllReadsSharded() {
|
||||||
|
int countOfReads = 0;
|
||||||
|
|
||||||
|
final int size = 247249719;
|
||||||
|
int currentStop = 0;
|
||||||
|
int incr = 100000;
|
||||||
|
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
|
||||||
|
while (currentStop < size) {
|
||||||
|
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||||
|
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||||
|
int lastStop = (currentStop > 1) ? currentStop: 1;
|
||||||
|
if (currentStop + incr < size)
|
||||||
|
currentStop += incr;
|
||||||
|
else
|
||||||
|
currentStop = size;
|
||||||
|
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", lastStop, currentStop)));
|
||||||
|
|
||||||
|
while (i.hasNext()) {
|
||||||
|
SAMRecord rec = i.next();
|
||||||
|
countOfReads++;
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
Assert.assertEquals(885, countOfReads);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -12,13 +12,13 @@ public class BamToFastqIntegrationTest extends WalkerTest {
|
||||||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||||
"-T BamToFastq -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
|
"-T BamToFastq -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("f742731e17fba105c7daae0e4f80ca1d"));
|
Arrays.asList("2ec2ff5ac405099bf48186d2c7e5fabd"));
|
||||||
executeTest("testBamToFasta", spec1);
|
executeTest("testBamToFasta", spec1);
|
||||||
|
|
||||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
WalkerTestSpec spec2 = new WalkerTestSpec(
|
||||||
"-T BamToFastq -reverse -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
|
"-T BamToFastq -reverse -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
|
||||||
1,
|
1,
|
||||||
Arrays.asList("19a5418fdf7b53dac8badb67bb1e1b88"));
|
Arrays.asList("5a79484da43925b0e0461be28fdad07c"));
|
||||||
executeTest("testBamToFastaReverse", spec2);
|
executeTest("testBamToFastaReverse", spec2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue