a fix for a bug Andrey discovered: in read-based interval traversals we're dupplicating reads in rare cases. The problem was that to accomidate a bug in SAM JDK indexing, we were forced to add one to the stop of our QueryOverlapping() calls to ensure we always got all of the overlapping reads.
Added a PlusOneFixIterator that wraps other iterators, and eliminates reads that start outside of our intended interval (interval stop - 1). Updated and checked BamToFastqIntegrationTest MD5 sums. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1976 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
15e80aec3b
commit
aacd72854f
|
|
@ -143,7 +143,7 @@ class MappedReadStreamPointer extends ReadStreamPointer {
|
|||
// over a given interval would occasionally not pick up the last read in that interval.
|
||||
mergingIterator.queryOverlapping( segment.locus.getContig(),
|
||||
(int)segment.locus.getStart(),
|
||||
(int)segment.locus.getStop()+1);
|
||||
(int)segment.locus.getStop()+ PlusOneFixIterator.PLUS_ONE_FIX_CONSTANT);
|
||||
|
||||
return StingSAMIteratorAdapter.adapt(sourceInfo,mergingIterator);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -172,7 +172,8 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
reads.getSupplementalFilters());
|
||||
|
||||
// add the new overlapping detection iterator, if we have a last interval and we're a read based shard
|
||||
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL ) iterator = new IntervalOverlapIterator(iterator,mLastInterval,false);
|
||||
if (mLastInterval != null && shard.getShardType() == Shard.ShardType.READ_INTERVAL )
|
||||
iterator = new PlusOneFixIterator(shard.getGenomeLoc(),new IntervalOverlapIterator(iterator,mLastInterval,false));
|
||||
mLastInterval = shard.getGenomeLoc();
|
||||
} else {
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,114 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaronmckenna
|
||||
* Date: Nov 4, 2009
|
||||
* Time: 10:41:02 PM
|
||||
*/
|
||||
public class PlusOneFixIterator implements StingSAMIterator {
|
||||
|
||||
/**
|
||||
* this holds the value that we use to correct our query overlapping calls with,
|
||||
* but more importantly it ties the ReadStreamPointer code to this so we don't loose
|
||||
* track of this adjustment across the code.
|
||||
*/
|
||||
public static final Integer PLUS_ONE_FIX_CONSTANT = 1;
|
||||
|
||||
/**
|
||||
* our interval region
|
||||
*/
|
||||
private final GenomeLoc mInterval;
|
||||
|
||||
/**
|
||||
* our iterator
|
||||
*/
|
||||
private final StingSAMIterator mIterator;
|
||||
|
||||
/**
|
||||
* our next record
|
||||
*/
|
||||
private SAMRecord mNextRecord;
|
||||
|
||||
/**
|
||||
* This iterator filters all reads that have their start < PLUS_ONE_FIX_CONSTANT from the end of the
|
||||
* interval:
|
||||
* <p/>
|
||||
* <p/>
|
||||
* (this won't make sense in a non-fixed width font)
|
||||
* --------------
|
||||
* interval |
|
||||
* ------------------------
|
||||
* | good read|
|
||||
* --------------------------------------------
|
||||
* | overlap < PLUS_ONE_FIX_CONSTANT, we drop|
|
||||
* -------------------------------------------
|
||||
* <p/>
|
||||
* The problem is that we had to put a +1 on our queryOverlapping() stop position value to SAMTools
|
||||
* due to an indexing problem. Other iterators don't know about this adjustment though, so calculations
|
||||
* like overlap fail. This iterator eliminates them before the read is seen by other iterators.
|
||||
* <p/>
|
||||
* create a plus one fix iterator, given:
|
||||
*
|
||||
* @param inteveral the interval we're checking
|
||||
* @param iterator the iterator to draw reads from
|
||||
*/
|
||||
public PlusOneFixIterator(GenomeLoc inteveral, StingSAMIterator iterator) {
|
||||
if (inteveral == null || iterator == null)
|
||||
throw new StingException("Both parameters to PlusOneFixIterator have to != null");
|
||||
|
||||
// save off the iterator and the interval
|
||||
mInterval = inteveral;
|
||||
mIterator = iterator;
|
||||
|
||||
// pop off the first read
|
||||
if (mIterator.hasNext()) {
|
||||
next();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return (mNextRecord != null);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SAMRecord next() {
|
||||
SAMRecord ret = mNextRecord;
|
||||
while (mIterator.hasNext()) {
|
||||
mNextRecord = mIterator.next();
|
||||
if (!(mNextRecord.getAlignmentStart() > (mInterval.getStop() - PLUS_ONE_FIX_CONSTANT))) return ret;
|
||||
}
|
||||
mNextRecord = null;
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("REMOVE on PlusOneFixIterator is not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Reads getSourceInfo() {
|
||||
return this.mIterator.getSourceInfo();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
this.mIterator.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,120 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: aaronmckenna
|
||||
* Date: Nov 4, 2009
|
||||
* Time: 11:02:24 PM
|
||||
*/
|
||||
public class PlusOneFixIteratorTest extends BaseTest {
|
||||
private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam");
|
||||
private static IndexedFastaSequenceFile seq;
|
||||
private int chromosomeOneReadCount = 885;
|
||||
|
||||
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||
/**
|
||||
* This function does the setup of our parser, before each method call.
|
||||
* <p/>
|
||||
* Called before every test case method.
|
||||
*/
|
||||
@BeforeClass
|
||||
public static void beforeAll() {
|
||||
try {
|
||||
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
} catch (FileNotFoundException e) {
|
||||
fail("Unexpected Exception" + e.getLocalizedMessage());
|
||||
}
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
/**
|
||||
* there's a read starting at chr1:1108664, make our interval go from chr1 to 1108664, apply the iterator,
|
||||
* and we shouldn't see the read.
|
||||
*/
|
||||
@Test
|
||||
public void testReadAtEndOfInterval() {
|
||||
int countOfReads = 0;
|
||||
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||
final int size = 1108664;
|
||||
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||
|
||||
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
|
||||
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size)));
|
||||
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
|
||||
while (i.hasNext()) {
|
||||
SAMRecord rec = i.next();
|
||||
countOfReads++;
|
||||
}
|
||||
Assert.assertEquals(2, countOfReads);
|
||||
}
|
||||
|
||||
/**
|
||||
* there are 4296 reads in chr1, make sure we see them all.
|
||||
*
|
||||
* chr1 length = 247249719 (and no reads start at the last position
|
||||
*/
|
||||
@Test
|
||||
public void testAllReads() {
|
||||
int countOfReads = 0;
|
||||
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||
final int size = 247249719;
|
||||
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
|
||||
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", 1, size)));
|
||||
//Iterator<SAMRecord> i = reader.queryOverlapping("chr1", 1, size);
|
||||
while (i.hasNext()) {
|
||||
SAMRecord rec = i.next();
|
||||
countOfReads++;
|
||||
}
|
||||
Assert.assertEquals(885, countOfReads);
|
||||
}
|
||||
|
||||
/**
|
||||
* there are 4296 reads in chr1, make sure we see them all.
|
||||
*
|
||||
* chr1 length = 247249719 (and no reads start at the last position
|
||||
*/
|
||||
@Test
|
||||
public void testAllReadsSharded() {
|
||||
int countOfReads = 0;
|
||||
|
||||
final int size = 247249719;
|
||||
int currentStop = 0;
|
||||
int incr = 100000;
|
||||
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, size);
|
||||
while (currentStop < size) {
|
||||
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||
int lastStop = (currentStop > 1) ? currentStop: 1;
|
||||
if (currentStop + incr < size)
|
||||
currentStop += incr;
|
||||
else
|
||||
currentStop = size;
|
||||
Iterator<SAMRecord> i = new PlusOneFixIterator(last, StingSAMIteratorAdapter.adapt(null, reader.queryOverlapping("chr1", lastStop, currentStop)));
|
||||
|
||||
while (i.hasNext()) {
|
||||
SAMRecord rec = i.next();
|
||||
countOfReads++;
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
Assert.assertEquals(885, countOfReads);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -12,13 +12,13 @@ public class BamToFastqIntegrationTest extends WalkerTest {
|
|||
WalkerTestSpec spec1 = new WalkerTestSpec(
|
||||
"-T BamToFastq -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
|
||||
1,
|
||||
Arrays.asList("f742731e17fba105c7daae0e4f80ca1d"));
|
||||
Arrays.asList("2ec2ff5ac405099bf48186d2c7e5fabd"));
|
||||
executeTest("testBamToFasta", spec1);
|
||||
|
||||
WalkerTestSpec spec2 = new WalkerTestSpec(
|
||||
"-T BamToFastq -reverse -R /broad/1KG/reference/human_b36_both.fasta -I /humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,100-10,000,500;1:10,100,000-10,101,000;1:10,900,000-10,900,001 -o %s",
|
||||
1,
|
||||
Arrays.asList("19a5418fdf7b53dac8badb67bb1e1b88"));
|
||||
Arrays.asList("5a79484da43925b0e0461be28fdad07c"));
|
||||
executeTest("testBamToFastaReverse", spec2);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue