added changes to support alec toUnmappedRead seek. Huge improvements (orders of magnitude) in unmapped read performance.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1021 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
4f6d26849f
commit
6ee64c7e43
|
|
@ -1,7 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
||||
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
|
|
@ -21,24 +20,43 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* User: aaron
|
||||
* Date: Mar 26, 2009
|
||||
* Time: 2:36:16 PM
|
||||
* <p/>
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
* <p/>
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
* Converts shards to SAM iterators over the specified region
|
||||
*/
|
||||
public class SAMDataSource implements SimpleDataSource {
|
||||
|
||||
|
||||
/** Backing support for reads. */
|
||||
private Reads reads = null;
|
||||
private final Reads reads;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
|
||||
|
|
@ -62,7 +80,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
/**
|
||||
* constructor, given sam files
|
||||
*
|
||||
* @param reads the list of sam files
|
||||
* @param reads the list of sam files
|
||||
* @param byReads are we a by reads traversal, or a loci traversal. We could delete this field
|
||||
* if we passed in iterGen, which would be better (although more complicated for the
|
||||
* consumers of SAMDataSources).
|
||||
|
|
@ -79,11 +97,12 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
|
||||
}
|
||||
}
|
||||
iteratorPool = new SAMIteratorPool(reads,byReads);
|
||||
iteratorPool = new SAMIteratorPool(reads, byReads);
|
||||
}
|
||||
|
||||
/**
|
||||
* For unit testing, add a custom iterator pool.
|
||||
*
|
||||
* @param iteratorPool Custom mock iterator pool.
|
||||
*/
|
||||
void setResourcePool( SAMIteratorPool iteratorPool ) {
|
||||
|
|
@ -100,7 +119,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
*
|
||||
* @return an iterator for that region
|
||||
*/
|
||||
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
|
||||
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
|
||||
return iteratorPool.iterator(location);
|
||||
}
|
||||
|
||||
|
|
@ -180,7 +199,7 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
iter.close();
|
||||
}
|
||||
iter = iteratorPool.iterator(null);
|
||||
bound = toUnmappedReads(shard.getSize(), iter);
|
||||
bound = toUnmappedReads(shard.getSize(), (QueryIterator) iter);
|
||||
}
|
||||
if (bound == null) {
|
||||
shard.signalDone();
|
||||
|
|
@ -211,41 +230,24 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
* @return the bounded iterator that you can use to get the intervaled reads from
|
||||
* @throws SimpleDataSourceLoadException
|
||||
*/
|
||||
BoundedReadIterator toUnmappedReads( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
|
||||
PeekableIterator<SAMRecord> peekable = new PeekableIterator<SAMRecord>(iter);
|
||||
BoundedReadIterator toUnmappedReads( long readCount, QueryIterator iter ) throws SimpleDataSourceLoadException {
|
||||
iter.queryUnmappedReads();
|
||||
|
||||
int count = 0;
|
||||
int cnt = 0;
|
||||
SAMRecord d = null;
|
||||
while (peekable.hasNext()) {
|
||||
d = peekable.peek();
|
||||
int x = d.getReferenceIndex();
|
||||
if (x < 0)
|
||||
// we have the magic read that starts the unmapped read segment!
|
||||
break;
|
||||
cnt++;
|
||||
peekable.next();
|
||||
}
|
||||
|
||||
// check to see what happened, did we run out of reads?
|
||||
if (!peekable.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// now walk until we've taken the unmapped read count
|
||||
while (peekable.hasNext() && count < this.readsTaken) {
|
||||
peekable.next();
|
||||
while (iter.hasNext() && count < this.readsTaken) {
|
||||
iter.next();
|
||||
count++;
|
||||
}
|
||||
|
||||
// check to see what happened, did we run out of reads?
|
||||
if (!peekable.hasNext()) {
|
||||
if (!iter.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// we're not out of unmapped reads, so increment our read count
|
||||
this.readsTaken += readCount;
|
||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, peekable), readCount);
|
||||
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -356,77 +358,104 @@ public class SAMDataSource implements SimpleDataSource {
|
|||
|
||||
}
|
||||
|
||||
class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger,StingSAMIterator> {
|
||||
/**
|
||||
* Source information about the reads.
|
||||
*/
|
||||
class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger, QueryIterator> {
|
||||
/** Source information about the reads. */
|
||||
protected Reads reads;
|
||||
|
||||
/**
|
||||
* Is this a by-reads traversal or a by-locus?
|
||||
*/
|
||||
/** Is this a by-reads traversal or a by-locus? */
|
||||
protected boolean byReads;
|
||||
|
||||
/**
|
||||
* File header for the combined file.
|
||||
*/
|
||||
/** File header for the combined file. */
|
||||
protected SAMFileHeader header;
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
protected static Logger logger = Logger.getLogger(SAMIteratorPool.class);
|
||||
protected static Logger logger = Logger.getLogger(SAMIteratorPool.class);
|
||||
|
||||
public SAMIteratorPool( Reads reads, boolean byReads ) {
|
||||
this.reads = reads;
|
||||
this.byReads = byReads;
|
||||
|
||||
SamFileHeaderMerger merger = createNewResource( null );
|
||||
SamFileHeaderMerger merger = createNewResource(null);
|
||||
this.header = merger.getMergedHeader();
|
||||
// Add this resource to the pool.
|
||||
this.addNewResource( merger );
|
||||
this.addNewResource(merger);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the combined header for all files in the iterator pool.
|
||||
*/
|
||||
/** Get the combined header for all files in the iterator pool. */
|
||||
public SAMFileHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List<SamFileHeaderMerger> mergers) {
|
||||
if( mergers.size() == 0 )
|
||||
protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List<SamFileHeaderMerger> mergers ) {
|
||||
if (mergers.size() == 0)
|
||||
return null;
|
||||
return mergers.get(0);
|
||||
}
|
||||
|
||||
protected SamFileHeaderMerger createNewResource( GenomeLoc position ) {
|
||||
return createHeaderMerger( reads, SAMFileHeader.SortOrder.coordinate );
|
||||
return createHeaderMerger(reads, SAMFileHeader.SortOrder.coordinate);
|
||||
}
|
||||
|
||||
protected StingSAMIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) {
|
||||
protected QueryIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) {
|
||||
final MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2(headerMerger, reads);
|
||||
|
||||
if( loc != null ) {
|
||||
if (loc != null) {
|
||||
if (byReads)
|
||||
iterator.queryContained(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
|
||||
else
|
||||
iterator.queryOverlapping(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
|
||||
}
|
||||
|
||||
return new StingSAMIterator() {
|
||||
public Reads getSourceInfo() { return reads; }
|
||||
return new QueryIterator() {
|
||||
public Reads getSourceInfo() {
|
||||
return reads;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
iterator.close();
|
||||
release(this);
|
||||
}
|
||||
public Iterator<SAMRecord> iterator() { return this; }
|
||||
public boolean hasNext() { return iterator.hasNext(); }
|
||||
public SAMRecord next() { return iterator.next(); }
|
||||
public void remove() { throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); }
|
||||
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return iterator.hasNext();
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
return iterator.next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Can't remove from a StingSAMIterator");
|
||||
}
|
||||
|
||||
public SAMRecord peek() {
|
||||
return iterator.peek();
|
||||
}
|
||||
|
||||
public void queryOverlapping( String contig, int start, int stop ) {
|
||||
iterator.queryOverlapping(contig, start, stop);
|
||||
}
|
||||
|
||||
public void query( String contig, int start, int stop, boolean contained ) {
|
||||
iterator.query(contig, start, stop, contained);
|
||||
}
|
||||
|
||||
public void queryUnmappedReads() {
|
||||
iterator.queryUnmappedReads();
|
||||
}
|
||||
|
||||
public void queryContained( String contig, int start, int stop ) {
|
||||
iterator.queryContained(contig, start, stop);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
protected void closeResource( SamFileHeaderMerger resource ) {
|
||||
for( SAMFileReader reader: resource.getReaders() )
|
||||
for (SAMFileReader reader : resource.getReaders())
|
||||
reader.close();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import net.sf.samtools.util.CloseableIterator;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.util.Comparator;
|
||||
|
|
@ -46,7 +47,7 @@ import java.util.PriorityQueue;
|
|||
* iterable stream. The underlying iterators/files must all have the same sort order unless
|
||||
* the requested output format is unsorted, in which case any combination is valid.
|
||||
*/
|
||||
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, PeekingStingIterator {
|
||||
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, QueryIterator {
|
||||
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
|
||||
protected final SamFileHeaderMerger samHeaderMerger;
|
||||
protected final SAMFileHeader.SortOrder sortOrder;
|
||||
|
|
@ -54,6 +55,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
private SAMRecord mNextRecord;
|
||||
protected boolean initialized = false;
|
||||
protected final Reads reads;
|
||||
protected boolean warnedUserAboutSortOrder = false; // so we only warn the user once
|
||||
|
||||
/**
|
||||
* Constructs a new merging iterator with the same set of readers and sort order as
|
||||
|
|
@ -62,7 +64,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
* @param headerMerger the header to merge
|
||||
* @param reads the reads pile
|
||||
*/
|
||||
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) {
|
||||
public MergingSamRecordIterator2( final SamFileHeaderMerger headerMerger, Reads reads ) {
|
||||
this.samHeaderMerger = headerMerger;
|
||||
this.reads = reads;
|
||||
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
|
||||
|
|
@ -90,11 +92,24 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
|
||||
}
|
||||
|
||||
private void checkSortOrder(SAMFileReader reader) {
|
||||
/**
|
||||
* verify the sort order
|
||||
*
|
||||
* @param reader the reader to check
|
||||
*/
|
||||
private void checkSortOrder( SAMFileReader reader ) {
|
||||
if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) {
|
||||
throw new PicardException("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
|
||||
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
|
||||
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
|
||||
if (reads.getSafetyChecking()) {
|
||||
throw new PicardException("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
|
||||
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
|
||||
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
|
||||
} else if(!warnedUserAboutSortOrder) {
|
||||
warnedUserAboutSortOrder = true;
|
||||
Utils.warnUser("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
|
||||
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
|
||||
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -102,7 +117,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
return true;
|
||||
}
|
||||
|
||||
public void queryOverlapping(final String contig, final int start, final int stop) {
|
||||
public void queryOverlapping( final String contig, final int start, final int stop ) {
|
||||
if (initialized) {
|
||||
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
|
||||
}
|
||||
|
|
@ -118,7 +133,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
|
||||
}
|
||||
|
||||
public void query(final String contig, final int start, final int stop, final boolean contained) {
|
||||
public void query( final String contig, final int start, final int stop, final boolean contained ) {
|
||||
if (initialized) {
|
||||
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
|
||||
}
|
||||
|
|
@ -133,7 +148,22 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
|
||||
}
|
||||
|
||||
public void queryContained(final String contig, final int start, final int stop) {
|
||||
public void queryUnmappedReads() {
|
||||
if (initialized) {
|
||||
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
|
||||
}
|
||||
final SAMRecordComparator comparator = getComparator();
|
||||
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
|
||||
Iterator<SAMRecord> recordIter = reader.queryUnmapped();
|
||||
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
|
||||
addIfNotEmpty(iterator);
|
||||
}
|
||||
setInitialized();
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void queryContained( final String contig, final int start, final int stop ) {
|
||||
if (initialized) {
|
||||
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
|
||||
}
|
||||
|
|
@ -237,7 +267,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
*
|
||||
* @param iterator the iterator to add
|
||||
*/
|
||||
protected void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
|
||||
protected void addIfNotEmpty( final ComparableSamRecordIterator iterator ) {
|
||||
//System.out.printf("Adding %s %s %d%n", iterator.peek().getReadName(), iterator.peek().getReferenceName(), iterator.peek().getAlignmentStart());
|
||||
if (iterator.hasNext()) {
|
||||
pq.offer(iterator);
|
||||
|
|
@ -262,11 +292,11 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
|
|||
// For unsorted build a fake comparator that compares based on object ID
|
||||
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
|
||||
return new SAMRecordComparator() {
|
||||
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
public int fileOrderCompare( final SAMRecord lhs, final SAMRecord rhs ) {
|
||||
return System.identityHashCode(lhs) - System.identityHashCode(rhs);
|
||||
}
|
||||
|
||||
public int compare(final SAMRecord lhs, final SAMRecord rhs) {
|
||||
public int compare( final SAMRecord lhs, final SAMRecord rhs ) {
|
||||
return fileOrderCompare(lhs, rhs);
|
||||
}
|
||||
};
|
||||
|
|
@ -345,13 +375,13 @@ class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements
|
|||
* @param sam the SAM file to read records from
|
||||
* @param comparator the Comparator to use to provide ordering of SAMRecords
|
||||
*/
|
||||
public ComparableSamRecordIterator(final SAMFileReader sam, final Comparator<SAMRecord> comparator) {
|
||||
public ComparableSamRecordIterator( final SAMFileReader sam, final Comparator<SAMRecord> comparator ) {
|
||||
super(sam.iterator());
|
||||
this.reader = sam;
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
public ComparableSamRecordIterator(final SAMFileReader sam, Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator) {
|
||||
public ComparableSamRecordIterator( final SAMFileReader sam, Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator ) {
|
||||
super(iterator); // use the provided iterator
|
||||
this.reader = sam;
|
||||
this.comparator = comparator;
|
||||
|
|
@ -381,7 +411,7 @@ class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements
|
|||
*
|
||||
* @return a negative, 0 or positive number as described in the Comparator interface
|
||||
*/
|
||||
public int compareTo(final ComparableSamRecordIterator that) {
|
||||
public int compareTo( final ComparableSamRecordIterator that ) {
|
||||
if (this.comparator.getClass() != that.comparator.getClass()) {
|
||||
throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
|
||||
"have different orderings internally");
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -30,14 +29,12 @@ import net.sf.samtools.SAMRecord;
|
|||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Interface Peekable
|
||||
* Class PeekingIterator
|
||||
* <p/>
|
||||
* This interface indicates that
|
||||
* a peekable interface, that requires a peek() method
|
||||
*/
|
||||
public interface PeekingStingIterator extends StingSAMIterator {
|
||||
/**
|
||||
* peek, given the specified type
|
||||
* @return
|
||||
*/
|
||||
SAMRecord peek();
|
||||
public interface PeekingIterator<T> {
|
||||
|
||||
/** @return returns a peeked value */
|
||||
public T peek();
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMRecordComparator;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
*
|
||||
* This interface indicates that the iterator is queryable
|
||||
*/
|
||||
public interface QueryIterator extends StingSAMIterator, PeekingIterator<SAMRecord> {
|
||||
|
||||
/**
|
||||
* The methods required to be queryable
|
||||
**/
|
||||
public void queryOverlapping(final String contig, final int start, final int stop);
|
||||
public void query(final String contig, final int start, final int stop, final boolean contained);
|
||||
public void queryUnmappedReads();
|
||||
public void queryContained(final String contig, final int start, final int stop);
|
||||
}
|
||||
|
|
@ -50,13 +50,13 @@ import java.math.BigInteger;
|
|||
*
|
||||
* A description should go here. Blame aaron if it's missing.
|
||||
*/
|
||||
public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
|
||||
public class ReadValidationWalker extends ReadWalker<SAMRecord, SAMRecord> {
|
||||
|
||||
// our MD5 sum
|
||||
private MessageDigest m;
|
||||
|
||||
// private list of md5sums
|
||||
private final List<BigInteger> list = new ArrayList<BigInteger>();
|
||||
private final List<String> list = new ArrayList<String>();
|
||||
|
||||
/**
|
||||
* The initialize function.
|
||||
|
|
@ -94,8 +94,8 @@ public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
|
|||
* bam file, if it was specified on the command line
|
||||
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return new Integer(0);
|
||||
public SAMRecord reduceInit() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -104,31 +104,16 @@ public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
|
|||
* @param output the output source
|
||||
* @return the SAMFileWriter, so that the next reduce can emit to the same source
|
||||
*/
|
||||
public Integer reduce( SAMRecord read, Integer output ) {
|
||||
byte[] ray = new byte[read.getReadName().length() + 5];
|
||||
int x = 0;
|
||||
for (char c: read.getReadName().toCharArray()) {
|
||||
ray[x] = (byte)c;
|
||||
//System.err.println("adding " + c + " to pos " + x);
|
||||
x++;
|
||||
public SAMRecord reduce( SAMRecord read, SAMRecord output ) {
|
||||
if (output == null)
|
||||
return read;
|
||||
if ((read.getReferenceIndex() == output.getReferenceIndex()) && (read.getAlignmentStart() < output.getAlignmentStart())) {
|
||||
System.err.println("saw the read " + read.getReadName() + " duplicated, old alignment = " + output.getAlignmentStart());
|
||||
}
|
||||
//System.err.println(read.getReadName() + " name, alignment = " + read.getAlignmentStart());
|
||||
int y = 0;
|
||||
for (;y < 4; y++) {
|
||||
ray[x+y] = (byte)((read.getAlignmentStart() >> y * 8) & 0x000f);
|
||||
else if (read.getReferenceIndex() != output.getReferenceIndex()){
|
||||
System.err.println("Switching Chromo");
|
||||
}
|
||||
ray[x+y] = read.getSecondOfPairFlag() ? (byte)1 : (byte)0;
|
||||
BigInteger bigInt = new BigInteger(m.digest(ray));
|
||||
//System.err.println(bigInt.toString());
|
||||
m.reset();
|
||||
if (this.list.contains(bigInt)) {
|
||||
throw new StingException("Seen Read: " + bigInt + "-> " + read.getReadName() + " before (list size = " + list.size() + ")");
|
||||
}
|
||||
if (read.getAlignmentStart() < output) {
|
||||
throw new StingException("Seen Read " + read.getReadName() + " has alignment of " + read.getAlignmentStart() + " before (list size = " + output + ")");
|
||||
}
|
||||
list.add(bigInt);
|
||||
return read.getAlignmentStart();
|
||||
return read;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,9 +1,12 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
||||
|
|
@ -34,7 +37,7 @@ import java.util.Iterator;
|
|||
*/
|
||||
|
||||
/** this fake iterator allows us to look at how specific piles of reads are handled */
|
||||
public class ArtificialSAMIterator implements PeekingStingIterator {
|
||||
public class ArtificialSAMIterator implements StingSAMIterator {
|
||||
|
||||
|
||||
protected int currentChromo = 0;
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord;
|
|||
import java.util.List;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
|
||||
|
||||
|
||||
/*
|
||||
|
|
@ -41,7 +42,7 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
* to test out classes that use specific intervals. The reads returned will
|
||||
* all lie in order in the specified interval.
|
||||
*/
|
||||
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
|
||||
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator implements QueryIterator {
|
||||
|
||||
// get the next position
|
||||
protected int finalPos = 0;
|
||||
|
|
@ -88,6 +89,39 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
|
|||
initialize(contig, start, stop);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void query( String contig, int start, int stop, boolean contained ) {
|
||||
if (contained)
|
||||
queryContained(contig, start, stop);
|
||||
else
|
||||
queryOverlapping(contig, start, stop);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void queryUnmappedReads() {
|
||||
initializeUnmapped();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* initialize the iterator to an unmapped read position
|
||||
*/
|
||||
public void initializeUnmapped() {
|
||||
ensureUntouched();
|
||||
while (super.hasNext() && this.peek().getReferenceIndex() >= 0) {
|
||||
super.next();
|
||||
}
|
||||
// sanity check that we have an actual matching read next
|
||||
SAMRecord rec = this.peek();
|
||||
if (rec == null) {
|
||||
throw new StingException("The next read doesn't match");
|
||||
}
|
||||
// set the seeked variable to true
|
||||
seeked = true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* initialize the query iterator
|
||||
|
|
@ -124,7 +158,7 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
|
|||
// sanity check that we have an actual matching read next
|
||||
SAMRecord rec = this.peek();
|
||||
if (!matches(rec)) {
|
||||
throw new StingException("The next read doesn't match");
|
||||
throw new StingException("The next read doesn't match");
|
||||
}
|
||||
// set the seeked variable to true
|
||||
seeked = true;
|
||||
|
|
@ -141,6 +175,11 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
|
|||
if (rec.getReferenceIndex() != this.contigIndex) {
|
||||
return false;
|
||||
}
|
||||
// if we have an unmapped read, matching the contig is good enough for us
|
||||
if (rec.getReferenceIndex() < 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!overlapping) {
|
||||
// if the start or the end are somewhere within our range
|
||||
if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) {
|
||||
|
|
|
|||
|
|
@ -5,8 +5,7 @@ import net.sf.samtools.*;
|
|||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
|
|
@ -103,6 +102,51 @@ public class ArtificialSAMUtils {
|
|||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* setup a default read group for a SAMFileHeader
|
||||
*
|
||||
* @param header the header to set
|
||||
* @param readGroupID the read group ID tag
|
||||
* @param sampleName the sample name
|
||||
*
|
||||
* @return the adjusted SAMFileHeader
|
||||
*/
|
||||
public static SAMFileHeader createDefaultReadGroup( SAMFileHeader header, String readGroupID, String sampleName ) {
|
||||
SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID);
|
||||
rec.setSample(sampleName);
|
||||
List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
|
||||
readGroups.add(rec);
|
||||
header.setReadGroups(readGroups);
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* setup read groups for the specified read groups and sample names
|
||||
*
|
||||
* @param header the header to set
|
||||
* @param readGroupIDs the read group ID tags
|
||||
* @param sampleNames the sample names
|
||||
*
|
||||
* @return the adjusted SAMFileHeader
|
||||
*/
|
||||
public static SAMFileHeader createEnumeratedReadGroups( SAMFileHeader header, List<String> readGroupIDs, List<String> sampleNames ) {
|
||||
if (readGroupIDs.size() != sampleNames.size()) {
|
||||
throw new StingException("read group count and sample name count must be the same");
|
||||
}
|
||||
|
||||
List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
|
||||
|
||||
int x = 0;
|
||||
for (; x < readGroupIDs.size(); x++) {
|
||||
SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupIDs.get(x));
|
||||
rec.setSample(sampleNames.get(x));
|
||||
readGroups.add(rec);
|
||||
}
|
||||
header.setReadGroups(readGroups);
|
||||
return header;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read
|
||||
*
|
||||
|
|
@ -115,7 +159,7 @@ public class ArtificialSAMUtils {
|
|||
* @return the artificial read
|
||||
*/
|
||||
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||
if( alignmentStart == 0 )
|
||||
if (alignmentStart == 0)
|
||||
throw new StingException("Invalid alignment start for artificial read");
|
||||
SAMRecord record = new SAMRecord(header);
|
||||
record.setReadName(name);
|
||||
|
|
@ -128,20 +172,26 @@ public class ArtificialSAMUtils {
|
|||
return record;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* create an iterator containing the specified read piles
|
||||
* Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read
|
||||
*
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
* @param header the SAM header to associate the read with
|
||||
* @param name the name of the read
|
||||
* @param refIndex the reference index, i.e. what chromosome to associate it with
|
||||
* @param alignmentStart where to start the alignment
|
||||
* @param bases the sequence of the read
|
||||
* @param qual the qualities of the read
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
* @return the artificial read
|
||||
*/
|
||||
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
return new ArtificialSAMIterator(startingChr, endingChr, readCount, header);
|
||||
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual ) {
|
||||
if (bases.length != qual.length) {
|
||||
throw new StingException("Passed in read string is different length then the quality array");
|
||||
}
|
||||
SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length);
|
||||
rec.setReadBases(bases);
|
||||
rec.setBaseQualities(bases);
|
||||
return rec;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -150,14 +200,29 @@ public class ArtificialSAMUtils {
|
|||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
*/
|
||||
public static QueryIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
|
||||
}
|
||||
|
||||
/**
|
||||
* create an iterator containing the specified read piles
|
||||
*
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
*/
|
||||
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
|
||||
public static QueryIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
return new ArtificialSAMIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -171,16 +236,16 @@ public class ArtificialSAMUtils {
|
|||
*/
|
||||
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) {
|
||||
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
|
||||
|
||||
|
||||
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
|
||||
}
|
||||
|
||||
/**
|
||||
* create an ArtificialSAMQueryIterator containing the specified read piles
|
||||
*
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
* @param startingChr the chromosome (reference ID) to start from
|
||||
* @param endingChr the id to end with
|
||||
* @param readCount the number of reads per chromosome
|
||||
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
|
||||
*
|
||||
* @return StingSAMIterator representing the specified amount of fake data
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
|||
import static junit.framework.Assert.fail;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
||||
|
|
@ -84,7 +83,7 @@ public class SAMByReadsTest extends BaseTest {
|
|||
SAMDataSource data = new SAMDataSource(reads,true);
|
||||
for (int x = 0; x < 10; x++) {
|
||||
++iterations;
|
||||
PeekingStingIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 10, 1000);
|
||||
QueryIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 10, 1000);
|
||||
BoundedReadIterator ret = data.toUnmappedReads(100, iter);
|
||||
// count the reads we've gotten back
|
||||
if (ret == null) {
|
||||
|
|
@ -173,7 +172,7 @@ public class SAMByReadsTest extends BaseTest {
|
|||
while (shardStrategy.hasNext()) {
|
||||
|
||||
|
||||
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
|
||||
StingSAMIterator ret = data.seek(shardStrategy.next());
|
||||
assertTrue(ret != null);
|
||||
while (ret.hasNext()) {
|
||||
ret.next();
|
||||
|
|
@ -227,7 +226,7 @@ class ArtificialResourcePool extends SAMIteratorPool {
|
|||
}
|
||||
|
||||
@Override
|
||||
public StingSAMIterator iterator( GenomeLoc loc ) {
|
||||
public QueryIterator iterator( GenomeLoc loc ) {
|
||||
ArtificialSAMQueryIterator iter = ArtificialSAMUtils.queryReadIterator(1, 10, 100, 1000);
|
||||
if (loc != null) {
|
||||
iter.queryContained(loc.getContig(), (int)loc.getStart(), (int)loc.getStop());
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ package org.broadinstitute.sting.utils.sam;
|
|||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
|
||||
import org.junit.Test;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.fail;
|
||||
|
|
@ -80,30 +80,5 @@ public class ArtificialSAMUtilsTest extends BaseTest {
|
|||
assertEquals(1000, count);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPeeking() {
|
||||
PeekingStingIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 100);
|
||||
int count = 0;
|
||||
while (iter.hasNext()) {
|
||||
int readCnt = ((ArtificialSAMIterator)(iter)).readsTaken();
|
||||
|
||||
// peek the record
|
||||
SAMRecord rec = iter.peek();
|
||||
assertTrue(rec.getReferenceIndex() >= 0);
|
||||
|
||||
// next the record
|
||||
SAMRecord rec2 = iter.next();
|
||||
assertTrue(rec2.getReadName() == rec.getReadName());
|
||||
assertTrue(rec2.getAlignmentStart() == rec.getAlignmentStart());
|
||||
|
||||
// find out how many reads we've taken now
|
||||
int readCnt2 = ((ArtificialSAMIterator)(iter)).readsTaken();
|
||||
|
||||
count++;
|
||||
if (count < 100*100) assertEquals(readCnt + 1, readCnt2);
|
||||
else assertEquals(readCnt, readCnt2);
|
||||
}
|
||||
assertEquals(100 * 100, count );
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue