Add an iterator that fixes the problem where read-based interval traversals return duplicate reads because a read spans two adjacent intervals.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1305 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7c20be157c
commit
0b16253db3
|
|
@ -52,7 +52,6 @@ import java.util.List;
|
||||||
*/
|
*/
|
||||||
public class SAMDataSource implements SimpleDataSource {
|
public class SAMDataSource implements SimpleDataSource {
|
||||||
|
|
||||||
|
|
||||||
/** Backing support for reads. */
|
/** Backing support for reads. */
|
||||||
private final Reads reads;
|
private final Reads reads;
|
||||||
|
|
||||||
|
|
@ -63,7 +62,7 @@ public class SAMDataSource implements SimpleDataSource {
|
||||||
long readsTaken = 0;
|
long readsTaken = 0;
|
||||||
|
|
||||||
// our last genome loc position
|
// our last genome loc position
|
||||||
GenomeLoc lastReadPos = null;
|
protected GenomeLoc lastReadPos = null;
|
||||||
|
|
||||||
// do we take unmapped reads
|
// do we take unmapped reads
|
||||||
private boolean includeUnmappedReads = true;
|
private boolean includeUnmappedReads = true;
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,109 @@
|
||||||
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.gatk.Reads;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author aaron
|
||||||
|
*
|
||||||
|
* Class DuplicateDetectorIterator
|
||||||
|
*
|
||||||
|
* remove reads that overlap the passed in interval, yea!
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class IntervalOverlapIterator implements StingSAMIterator {
|
||||||
|
|
||||||
|
// our wrapped iterator
|
||||||
|
private final StingSAMIterator mIter;
|
||||||
|
private final boolean throwException;
|
||||||
|
|
||||||
|
// storage for the next record
|
||||||
|
private SAMRecord mNextRecord = null;
|
||||||
|
|
||||||
|
// the genomic location which we filter on
|
||||||
|
private final GenomeLoc mLoc;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a DuplicateDetectorIterator from another sam iterator
|
||||||
|
* @param iter something that implements StingSAMIterator
|
||||||
|
* @param blowUpOnDup if we find a dup, do we throw an exception (blow up) or do we drop it
|
||||||
|
*/
|
||||||
|
public IntervalOverlapIterator(StingSAMIterator iter, GenomeLoc filterLocation, boolean blowUpOnDup) {
|
||||||
|
this.mIter = iter;
|
||||||
|
this.throwException = blowUpOnDup;
|
||||||
|
this.mLoc = filterLocation;
|
||||||
|
if (iter.hasNext()) {
|
||||||
|
next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets source information for the reads. Contains information about the original reads
|
||||||
|
* files, plus information about downsampling, etc.
|
||||||
|
*
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Reads getSourceInfo() {
|
||||||
|
return mIter.getSourceInfo();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* close this iterator
|
||||||
|
*/
|
||||||
|
public void close() {
|
||||||
|
if (mIter != null) mIter.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* do we have a next?
|
||||||
|
* @return true if yes, false if not
|
||||||
|
*/
|
||||||
|
public boolean hasNext() {
|
||||||
|
return (mNextRecord != null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* get the next record
|
||||||
|
* @return a SAMRecord
|
||||||
|
*/
|
||||||
|
public SAMRecord next() {
|
||||||
|
SAMRecord ret = mNextRecord;
|
||||||
|
while (mIter.hasNext()) {
|
||||||
|
mNextRecord = mIter.next();
|
||||||
|
if (!isOverlaping(mNextRecord)) return ret;
|
||||||
|
}
|
||||||
|
mNextRecord = null;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* not supported
|
||||||
|
*/
|
||||||
|
public void remove() {
|
||||||
|
throw new UnsupportedOperationException("You can't call remove, so, like, I guess please don't");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* create an iterator out of the this type
|
||||||
|
* @return this!
|
||||||
|
*/
|
||||||
|
public Iterator<SAMRecord> iterator() {
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* determine if a read overlaps the specified interval that was passed in
|
||||||
|
* @param rec the read
|
||||||
|
* @return true if it overlaps, false otherwise
|
||||||
|
*/
|
||||||
|
private boolean isOverlaping(SAMRecord rec) {
|
||||||
|
return mLoc.overlapsP(GenomeLocParser.createGenomeLoc(rec));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,80 @@
|
||||||
|
package org.broadinstitute.sting.gatk.iterators;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileReader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import static org.junit.Assert.fail;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author aaron
|
||||||
|
* <p/>
|
||||||
|
* Class DuplicateDetectorIteratorTest
|
||||||
|
* <p/>
|
||||||
|
* test the DuplicateDetectorIterator class.
|
||||||
|
*/
|
||||||
|
public class IntervalOverlapIteratorTest extends BaseTest {
|
||||||
|
private final File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam");
|
||||||
|
private static IndexedFastaSequenceFile seq;
|
||||||
|
private int chromosomeOneReadCount = 885;
|
||||||
|
|
||||||
|
//GenomeLoc.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||||
|
/**
|
||||||
|
* This function does the setup of our parser, before each method call.
|
||||||
|
* <p/>
|
||||||
|
* Called before every test case method.
|
||||||
|
*/
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeAll() {
|
||||||
|
try {
|
||||||
|
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
fail("Unexpected Exception" + e.getLocalizedMessage());
|
||||||
|
}
|
||||||
|
GenomeLocParser.setupRefContigOrdering(seq);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOverlappingIntervals() {
|
||||||
|
int countOfReads = 0;
|
||||||
|
int seqLength = seq.getSequenceDictionary().getSequence("chr1").getSequenceLength();
|
||||||
|
GenomeLoc last = GenomeLocParser.createGenomeLoc("chr1", 1, 470535);
|
||||||
|
|
||||||
|
// first count the initial pile of reads
|
||||||
|
SAMFileReader reader = new SAMFileReader(bam, true);
|
||||||
|
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||||
|
Iterator<SAMRecord> i = reader.queryOverlapping("chr1",1,470535);
|
||||||
|
GenomeLoc newLoc;
|
||||||
|
while (i.hasNext()) {
|
||||||
|
i.next();
|
||||||
|
countOfReads++;
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
while (last.getStart() < seq.getSequenceDictionary().getSequence("chr1").getSequenceLength()) {
|
||||||
|
reader = new SAMFileReader(bam, true);
|
||||||
|
long stop = (last.getStop() >= seqLength) ? seqLength : last.getStop() + 470535;
|
||||||
|
newLoc = GenomeLocParser.createGenomeLoc(last.getContigIndex(),last.getStart()+470535,stop);
|
||||||
|
reader.setValidationStringency(SAMFileReader.ValidationStringency.LENIENT);
|
||||||
|
i = reader.queryOverlapping(newLoc.getContig(),(int)newLoc.getStart(),(int)newLoc.getStop());
|
||||||
|
IntervalOverlapIterator iter = new IntervalOverlapIterator(StingSAMIteratorAdapter.adapt(null, i),last,false);
|
||||||
|
while(iter.hasNext()) {
|
||||||
|
countOfReads++;
|
||||||
|
iter.next();
|
||||||
|
}
|
||||||
|
last = newLoc;
|
||||||
|
reader.close();
|
||||||
|
|
||||||
|
}
|
||||||
|
Assert.assertEquals(chromosomeOneReadCount,countOfReads);
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue