added changes to support alec toUnmappedRead seek. Huge improvements (orders of magnitude) in unmapped read performance.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1021 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-06-16 22:15:56 +00:00
parent 4f6d26849f
commit 6ee64c7e43
10 changed files with 340 additions and 170 deletions

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources; package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMReadGroupRecord;
@ -21,24 +20,43 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Iterator; import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/** /**
* User: aaron * User: aaron
* Date: Mar 26, 2009 * Date: Mar 26, 2009
* Time: 2:36:16 PM * Time: 2:36:16 PM
* <p/> *
* The Broad Institute * Converts shards to SAM iterators over the specified region
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/ */
public class SAMDataSource implements SimpleDataSource { public class SAMDataSource implements SimpleDataSource {
/** Backing support for reads. */ /** Backing support for reads. */
private Reads reads = null; private final Reads reads;
/** our log, which we want to capture anything from this class */ /** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class); protected static Logger logger = Logger.getLogger(SAMDataSource.class);
@ -62,7 +80,7 @@ public class SAMDataSource implements SimpleDataSource {
/** /**
* constructor, given sam files * constructor, given sam files
* *
* @param reads the list of sam files * @param reads the list of sam files
* @param byReads are we a by reads traversal, or a loci traversal. We could delete this field * @param byReads are we a by reads traversal, or a loci traversal. We could delete this field
* if we passed in iterGen, which would be a better (although more complicated for the * if we passed in iterGen, which would be a better (although more complicated for the
* consumers of SAMDataSources). * consumers of SAMDataSources).
@ -79,11 +97,12 @@ public class SAMDataSource implements SimpleDataSource {
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName()); throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
} }
} }
iteratorPool = new SAMIteratorPool(reads,byReads); iteratorPool = new SAMIteratorPool(reads, byReads);
} }
/** /**
* For unit testing, add a custom iterator pool. * For unit testing, add a custom iterator pool.
*
* @param iteratorPool Custom mock iterator pool. * @param iteratorPool Custom mock iterator pool.
*/ */
void setResourcePool( SAMIteratorPool iteratorPool ) { void setResourcePool( SAMIteratorPool iteratorPool ) {
@ -100,7 +119,7 @@ public class SAMDataSource implements SimpleDataSource {
* *
* @return an iterator for that region * @return an iterator for that region
*/ */
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException { public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
return iteratorPool.iterator(location); return iteratorPool.iterator(location);
} }
@ -180,7 +199,7 @@ public class SAMDataSource implements SimpleDataSource {
iter.close(); iter.close();
} }
iter = iteratorPool.iterator(null); iter = iteratorPool.iterator(null);
bound = toUnmappedReads(shard.getSize(), iter); bound = toUnmappedReads(shard.getSize(), (QueryIterator) iter);
} }
if (bound == null) { if (bound == null) {
shard.signalDone(); shard.signalDone();
@ -211,41 +230,24 @@ public class SAMDataSource implements SimpleDataSource {
* @return the bounded iterator that you can use to get the intervaled reads from * @return the bounded iterator that you can use to get the intervaled reads from
* @throws SimpleDataSourceLoadException * @throws SimpleDataSourceLoadException
*/ */
BoundedReadIterator toUnmappedReads( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException { BoundedReadIterator toUnmappedReads( long readCount, QueryIterator iter ) throws SimpleDataSourceLoadException {
PeekableIterator<SAMRecord> peekable = new PeekableIterator<SAMRecord>(iter); iter.queryUnmappedReads();
int count = 0; int count = 0;
int cnt = 0;
SAMRecord d = null;
while (peekable.hasNext()) {
d = peekable.peek();
int x = d.getReferenceIndex();
if (x < 0)
// we have the magic read that starts the unmapped read segment!
break;
cnt++;
peekable.next();
}
// check to see what happened, did we run out of reads?
if (!peekable.hasNext()) {
return null;
}
// now walk until we've taken the unmapped read count // now walk until we've taken the unmapped read count
while (peekable.hasNext() && count < this.readsTaken) { while (iter.hasNext() && count < this.readsTaken) {
peekable.next(); iter.next();
count++; count++;
} }
// check to see what happened, did we run out of reads? // check to see what happened, did we run out of reads?
if (!peekable.hasNext()) { if (!iter.hasNext()) {
return null; return null;
} }
// we're not out of unmapped reads, so increment our read cout // we're not out of unmapped reads, so increment our read cout
this.readsTaken += readCount; this.readsTaken += readCount;
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, peekable), readCount); return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
} }
@ -356,20 +358,14 @@ public class SAMDataSource implements SimpleDataSource {
} }
class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger,StingSAMIterator> { class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger, QueryIterator> {
/** /** Source information about the reads. */
* Source information about the reads.
*/
protected Reads reads; protected Reads reads;
/** /** Is this a by-reads traversal or a by-locus? */
* Is this a by-reads traversal or a by-locus?
*/
protected boolean byReads; protected boolean byReads;
/** /** File header for the combined file. */
* File header for the combined file.
*/
protected SAMFileHeader header; protected SAMFileHeader header;
/** our log, which we want to capture anything from this class */ /** our log, which we want to capture anything from this class */
@ -379,54 +375,87 @@ class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger,StingSAMIterator>
this.reads = reads; this.reads = reads;
this.byReads = byReads; this.byReads = byReads;
SamFileHeaderMerger merger = createNewResource( null ); SamFileHeaderMerger merger = createNewResource(null);
this.header = merger.getMergedHeader(); this.header = merger.getMergedHeader();
// Add this resource to the pool. // Add this resource to the pool.
this.addNewResource( merger ); this.addNewResource(merger);
} }
/** /** Get the combined header for all files in the iterator pool. */
* Get the combined header for all files in the iterator pool.
*/
public SAMFileHeader getHeader() { public SAMFileHeader getHeader() {
return header; return header;
} }
protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List<SamFileHeaderMerger> mergers) { protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List<SamFileHeaderMerger> mergers ) {
if( mergers.size() == 0 ) if (mergers.size() == 0)
return null; return null;
return mergers.get(0); return mergers.get(0);
} }
protected SamFileHeaderMerger createNewResource( GenomeLoc position ) { protected SamFileHeaderMerger createNewResource( GenomeLoc position ) {
return createHeaderMerger( reads, SAMFileHeader.SortOrder.coordinate ); return createHeaderMerger(reads, SAMFileHeader.SortOrder.coordinate);
} }
protected StingSAMIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) { protected QueryIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) {
final MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2(headerMerger, reads); final MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2(headerMerger, reads);
if( loc != null ) { if (loc != null) {
if (byReads) if (byReads)
iterator.queryContained(loc.getContig(), (int) loc.getStart(), (int) loc.getStop()); iterator.queryContained(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
else else
iterator.queryOverlapping(loc.getContig(), (int) loc.getStart(), (int) loc.getStop()); iterator.queryOverlapping(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
} }
return new StingSAMIterator() { return new QueryIterator() {
public Reads getSourceInfo() { return reads; } public Reads getSourceInfo() {
return reads;
}
public void close() { public void close() {
iterator.close(); iterator.close();
release(this); release(this);
} }
public Iterator<SAMRecord> iterator() { return this; }
public boolean hasNext() { return iterator.hasNext(); } public Iterator<SAMRecord> iterator() {
public SAMRecord next() { return iterator.next(); } return this;
public void remove() { throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); } }
public boolean hasNext() {
return iterator.hasNext();
}
public SAMRecord next() {
return iterator.next();
}
public void remove() {
throw new UnsupportedOperationException("Can't remove from a StingSAMIterator");
}
public SAMRecord peek() {
return iterator.peek();
}
public void queryOverlapping( String contig, int start, int stop ) {
iterator.queryOverlapping(contig, start, stop);
}
public void query( String contig, int start, int stop, boolean contained ) {
iterator.query(contig, start, stop, contained);
}
public void queryUnmappedReads() {
iterator.queryUnmappedReads();
}
public void queryContained( String contig, int start, int stop ) {
iterator.queryContained(contig, start, stop);
}
}; };
} }
protected void closeResource( SamFileHeaderMerger resource ) { protected void closeResource( SamFileHeaderMerger resource ) {
for( SAMFileReader reader: resource.getReaders() ) for (SAMFileReader reader : resource.getReaders())
reader.close(); reader.close();
} }

View File

@ -34,6 +34,7 @@ import net.sf.samtools.util.CloseableIterator;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import java.lang.reflect.Constructor; import java.lang.reflect.Constructor;
import java.util.Comparator; import java.util.Comparator;
@ -46,7 +47,7 @@ import java.util.PriorityQueue;
* iterable stream. The underlying iterators/files must all have the same sort order unless * iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid. * the requested output format is unsorted, in which case any combination is valid.
*/ */
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, PeekingStingIterator { public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, QueryIterator {
protected PriorityQueue<ComparableSamRecordIterator> pq = null; protected PriorityQueue<ComparableSamRecordIterator> pq = null;
protected final SamFileHeaderMerger samHeaderMerger; protected final SamFileHeaderMerger samHeaderMerger;
protected final SAMFileHeader.SortOrder sortOrder; protected final SAMFileHeader.SortOrder sortOrder;
@ -54,6 +55,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
private SAMRecord mNextRecord; private SAMRecord mNextRecord;
protected boolean initialized = false; protected boolean initialized = false;
protected final Reads reads; protected final Reads reads;
protected boolean warnedUserAboutSortOrder = false; // so we only warn the user once
/** /**
* Constructs a new merging iterator with the same set of readers and sort order as * Constructs a new merging iterator with the same set of readers and sort order as
@ -62,7 +64,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
* @param headerMerger the header to merge * @param headerMerger the header to merge
* @param reads the reads pile * @param reads the reads pile
*/ */
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) { public MergingSamRecordIterator2( final SamFileHeaderMerger headerMerger, Reads reads ) {
this.samHeaderMerger = headerMerger; this.samHeaderMerger = headerMerger;
this.reads = reads; this.reads = reads;
this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
@ -90,11 +92,24 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
} }
private void checkSortOrder(SAMFileReader reader) { /**
* verify the sort order
*
* @param reader the reader to check
*/
private void checkSortOrder( SAMFileReader reader ) {
if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) { if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) {
throw new PicardException("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() + if (reads.getSafetyChecking()) {
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals " throw new PicardException("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
+ reader.getFileHeader().getAttribute("SO") + " in this case)."); " vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
} else if(!warnedUserAboutSortOrder) {
warnedUserAboutSortOrder = true;
Utils.warnUser("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
}
} }
} }
@ -102,7 +117,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
return true; return true;
} }
public void queryOverlapping(final String contig, final int start, final int stop) { public void queryOverlapping( final String contig, final int start, final int stop ) {
if (initialized) { if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2"); throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
} }
@ -118,7 +133,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
} }
public void query(final String contig, final int start, final int stop, final boolean contained) { public void query( final String contig, final int start, final int stop, final boolean contained ) {
if (initialized) { if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2"); throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
} }
@ -133,7 +148,22 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
} }
public void queryContained(final String contig, final int start, final int stop) { public void queryUnmappedReads() {
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
Iterator<SAMRecord> recordIter = reader.queryUnmapped();
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
addIfNotEmpty(iterator);
}
setInitialized();
}
public void queryContained( final String contig, final int start, final int stop ) {
if (initialized) { if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2"); throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
} }
@ -237,7 +267,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
* *
* @param iterator the iterator to add * @param iterator the iterator to add
*/ */
protected void addIfNotEmpty(final ComparableSamRecordIterator iterator) { protected void addIfNotEmpty( final ComparableSamRecordIterator iterator ) {
//System.out.printf("Adding %s %s %d%n", iterator.peek().getReadName(), iterator.peek().getReferenceName(), iterator.peek().getAlignmentStart()); //System.out.printf("Adding %s %s %d%n", iterator.peek().getReadName(), iterator.peek().getReferenceName(), iterator.peek().getAlignmentStart());
if (iterator.hasNext()) { if (iterator.hasNext()) {
pq.offer(iterator); pq.offer(iterator);
@ -262,11 +292,11 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
// For unsorted build a fake comparator that compares based on object ID // For unsorted build a fake comparator that compares based on object ID
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
return new SAMRecordComparator() { return new SAMRecordComparator() {
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { public int fileOrderCompare( final SAMRecord lhs, final SAMRecord rhs ) {
return System.identityHashCode(lhs) - System.identityHashCode(rhs); return System.identityHashCode(lhs) - System.identityHashCode(rhs);
} }
public int compare(final SAMRecord lhs, final SAMRecord rhs) { public int compare( final SAMRecord lhs, final SAMRecord rhs ) {
return fileOrderCompare(lhs, rhs); return fileOrderCompare(lhs, rhs);
} }
}; };
@ -345,13 +375,13 @@ class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements
* @param sam the SAM file to read records from * @param sam the SAM file to read records from
* @param comparator the Comparator to use to provide ordering fo SAMRecords * @param comparator the Comparator to use to provide ordering fo SAMRecords
*/ */
public ComparableSamRecordIterator(final SAMFileReader sam, final Comparator<SAMRecord> comparator) { public ComparableSamRecordIterator( final SAMFileReader sam, final Comparator<SAMRecord> comparator ) {
super(sam.iterator()); super(sam.iterator());
this.reader = sam; this.reader = sam;
this.comparator = comparator; this.comparator = comparator;
} }
public ComparableSamRecordIterator(final SAMFileReader sam, Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator) { public ComparableSamRecordIterator( final SAMFileReader sam, Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator ) {
super(iterator); // use the provided iterator super(iterator); // use the provided iterator
this.reader = sam; this.reader = sam;
this.comparator = comparator; this.comparator = comparator;
@ -381,7 +411,7 @@ class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements
* *
* @return a negative, 0 or positive number as described in the Comparator interface * @return a negative, 0 or positive number as described in the Comparator interface
*/ */
public int compareTo(final ComparableSamRecordIterator that) { public int compareTo( final ComparableSamRecordIterator that ) {
if (this.comparator.getClass() != that.comparator.getClass()) { if (this.comparator.getClass() != that.comparator.getClass()) {
throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " + throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
"have different orderings internally"); "have different orderings internally");

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.gatk.iterators; package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
/* /*
* Copyright (c) 2009 The Broad Institute * Copyright (c) 2009 The Broad Institute
@ -30,14 +29,12 @@ import net.sf.samtools.SAMRecord;
/** /**
* @author aaron * @author aaron
* <p/> * <p/>
* Interface Peekable * Class PeekingIterator
* <p/> * <p/>
* This interface indicates that * a peekable interface, that requires a peek() method
*/ */
public interface PeekingStingIterator extends StingSAMIterator { public interface PeekingIterator<T> {
/**
* peek, given the specified type /** @return returns a peeked value */
* @return public T peek();
*/
SAMRecord peek();
} }

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordComparator;
import net.sf.samtools.SAMFileReader;
import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
*
* This interface indicates that the iterator is query able
*/
public interface QueryIterator extends StingSAMIterator, PeekingIterator<SAMRecord> {
/**
* The required methods to query able
**/
public void queryOverlapping(final String contig, final int start, final int stop);
public void query(final String contig, final int start, final int stop, final boolean contained);
public void queryUnmappedReads();
public void queryContained(final String contig, final int start, final int stop);
}

View File

@ -50,13 +50,13 @@ import java.math.BigInteger;
* *
* A descriptions should go here. Blame aaron if it's missing. * A descriptions should go here. Blame aaron if it's missing.
*/ */
public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> { public class ReadValidationWalker extends ReadWalker<SAMRecord, SAMRecord> {
// our MD5 sum // our MD5 sum
private MessageDigest m; private MessageDigest m;
// private list of md5sums // private list of md5sums
private final List<BigInteger> list = new ArrayList<BigInteger>(); private final List<String> list = new ArrayList<String>();
/** /**
* The initialize function. * The initialize function.
@ -94,8 +94,8 @@ public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
* bam file, if it was specified on the command line * bam file, if it was specified on the command line
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
*/ */
public Integer reduceInit() { public SAMRecord reduceInit() {
return new Integer(0); return null;
} }
/** /**
@ -104,31 +104,16 @@ public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
* @param output the output source * @param output the output source
* @return the SAMFileWriter, so that the next reduce can emit to the same source * @return the SAMFileWriter, so that the next reduce can emit to the same source
*/ */
public Integer reduce( SAMRecord read, Integer output ) { public SAMRecord reduce( SAMRecord read, SAMRecord output ) {
byte[] ray = new byte[read.getReadName().length() + 5]; if (output == null)
int x = 0; return read;
for (char c: read.getReadName().toCharArray()) { if ((read.getReferenceIndex() == output.getReferenceIndex()) && (read.getAlignmentStart() < output.getAlignmentStart())) {
ray[x] = (byte)c; System.err.println("saw the read " + read.getReadName() + " duplicated, old alignment = " + output.getAlignmentStart());
//System.err.println("adding " + c + " to pos " + x);
x++;
} }
//System.err.println(read.getReadName() + " name, alignment = " + read.getAlignmentStart()); else if (read.getReferenceIndex() != output.getReferenceIndex()){
int y = 0; System.err.println("Switching Chromo");
for (;y < 4; y++) {
ray[x+y] = (byte)((read.getAlignmentStart() >> y * 8) & 0x000f);
} }
ray[x+y] = read.getSecondOfPairFlag() ? (byte)1 : (byte)0; return read;
BigInteger bigInt = new BigInteger(m.digest(ray));
//System.err.println(bigInt.toString());
m.reset();
if (this.list.contains(bigInt)) {
throw new StingException("Seen Read: " + bigInt + "-> " + read.getReadName() + " before (list size = " + list.size() + ")");
}
if (read.getAlignmentStart() < output) {
throw new StingException("Seen Read " + read.getReadName() + " has alignment of " + read.getAlignmentStart() + " before (list size = " + output + ")");
}
list.add(bigInt);
return read.getAlignmentStart();
} }

View File

@ -1,9 +1,12 @@
package org.broadinstitute.sting.utils.sam; package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.Reads;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.util.PeekableIterator;
import java.util.Iterator; import java.util.Iterator;
@ -34,7 +37,7 @@ import java.util.Iterator;
*/ */
/** this fake iterator allows us to look at how specific piles of reads are handled */ /** this fake iterator allows us to look at how specific piles of reads are handled */
public class ArtificialSAMIterator implements PeekingStingIterator { public class ArtificialSAMIterator implements StingSAMIterator {
protected int currentChromo = 0; protected int currentChromo = 0;

View File

@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord;
import java.util.List; import java.util.List;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
/* /*
@ -41,7 +42,7 @@ import org.broadinstitute.sting.utils.StingException;
* to test out classes that use specific itervals. The reads returned will * to test out classes that use specific itervals. The reads returned will
* all lie in order in the specified interval. * all lie in order in the specified interval.
*/ */
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator { public class ArtificialSAMQueryIterator extends ArtificialSAMIterator implements QueryIterator {
// get the next positon // get the next positon
protected int finalPos = 0; protected int finalPos = 0;
@ -88,6 +89,39 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
initialize(contig, start, stop); initialize(contig, start, stop);
} }
@Override
public void query( String contig, int start, int stop, boolean contained ) {
if (contained)
queryContained(contig, start, stop);
else
queryOverlapping(contig, start, stop);
}
@Override
public void queryUnmappedReads() {
initializeUnmapped();
}
/**
* initialize the iterator to an unmapped read position
*/
public void initializeUnmapped() {
ensureUntouched();
while (super.hasNext() && this.peek().getReferenceIndex() >= 0) {
super.next();
}
// sanity check that we have an actual matching read next
SAMRecord rec = this.peek();
if (rec == null) {
throw new StingException("The next read doesn't match");
}
// set the seeked variable to true
seeked = true;
}
/** /**
* initialize the query iterator * initialize the query iterator
@ -141,6 +175,11 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
if (rec.getReferenceIndex() != this.contigIndex) { if (rec.getReferenceIndex() != this.contigIndex) {
return false; return false;
} }
// if we have an unmapped read, matching the contig is good enough for us
if (rec.getReferenceIndex() < 0) {
return true;
}
if (!overlapping) { if (!overlapping) {
// if the start or the end are somewhere within our range // if the start or the end are somewhere within our range
if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) { if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) {

View File

@ -5,8 +5,7 @@ import net.sf.samtools.*;
import java.io.File; import java.io.File;
import java.util.*; import java.util.*;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
/** /**
@ -103,6 +102,51 @@ public class ArtificialSAMUtils {
return header; return header;
} }
/**
* setup a default read group for a SAMFileHeader
*
* @param header the header to set
* @param readGroupID the read group ID tag
* @param sampleName the sample name
*
* @return the adjusted SAMFileHeader
*/
public static SAMFileHeader createDefaultReadGroup( SAMFileHeader header, String readGroupID, String sampleName ) {
SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID);
rec.setSample(sampleName);
List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
readGroups.add(rec);
header.setReadGroups(readGroups);
return header;
}
/**
* setup read groups for the specified read groups and sample names
*
* @param header the header to set
* @param readGroupIDs the read group ID tags
* @param sampleNames the sample names
*
* @return the adjusted SAMFileHeader
*/
public static SAMFileHeader createEnumeratedReadGroups( SAMFileHeader header, List<String> readGroupIDs, List<String> sampleNames ) {
if (readGroupIDs.size() != sampleNames.size()) {
throw new StingException("read group count and sample name count must be the same");
}
List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
int x = 0;
for (; x < readGroupIDs.size(); x++) {
SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupIDs.get(x));
rec.setSample(sampleNames.get(x));
readGroups.add(rec);
}
header.setReadGroups(readGroups);
return header;
}
/** /**
* Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read
* *
@ -115,7 +159,7 @@ public class ArtificialSAMUtils {
* @return the artificial read * @return the artificial read
*/ */
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
if( alignmentStart == 0 ) if (alignmentStart == 0)
throw new StingException("Invalid alignment start for artificial read"); throw new StingException("Invalid alignment start for artificial read");
SAMRecord record = new SAMRecord(header); SAMRecord record = new SAMRecord(header);
record.setReadName(name); record.setReadName(name);
@ -128,20 +172,26 @@ public class ArtificialSAMUtils {
return record; return record;
} }
/** /**
* create an iterator containing the specified read piles * Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read
* *
* @param startingChr the chromosome (reference ID) to start from * @param header the SAM header to associate the read with
* @param endingChr the id to end with * @param name the name of the read
* @param readCount the number of reads per chromosome * @param refIndex the reference index, i.e. what chromosome to associate it with
* @param alignmentStart where to start the alignment
* @param bases the sequence of the read
* @param qual the qualities of the read
* *
* @return StingSAMIterator representing the specified amount of fake data * @return the artificial read
*/ */
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) { public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); if (bases.length != qual.length) {
throw new StingException("Passed in read string is different length then the quality array");
return new ArtificialSAMIterator(startingChr, endingChr, readCount, header); }
SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length);
rec.setReadBases(bases);
rec.setBaseQualities(bases);
return rec;
} }
/** /**
@ -150,14 +200,29 @@ public class ArtificialSAMUtils {
* @param startingChr the chromosome (reference ID) to start from * @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with * @param endingChr the id to end with
* @param readCount the number of reads per chromosome * @param readCount the number of reads per chromosome
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static QueryIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
}
/**
* create an iterator containing the specified read piles
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
* *
* @return StingSAMIterator representing the specified amount of fake data * @return StingSAMIterator representing the specified amount of fake data
*/ */
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) { public static QueryIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMIterator(startingChr, endingChr, readCount, unmappedReadCount, header); return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
} }
/** /**
@ -178,9 +243,9 @@ public class ArtificialSAMUtils {
/** /**
* create an ArtificialSAMQueryIterator containing the specified read piles * create an ArtificialSAMQueryIterator containing the specified read piles
* *
* @param startingChr the chromosome (reference ID) to start from * @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with * @param endingChr the id to end with
* @param readCount the number of reads per chromosome * @param readCount the number of reads per chromosome
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file * @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
* *
* @return StingSAMIterator representing the specified amount of fake data * @return StingSAMIterator representing the specified amount of fake data

View File

@ -3,7 +3,6 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import static junit.framework.Assert.fail; import static junit.framework.Assert.fail;
import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
@ -84,7 +83,7 @@ public class SAMByReadsTest extends BaseTest {
SAMDataSource data = new SAMDataSource(reads,true); SAMDataSource data = new SAMDataSource(reads,true);
for (int x = 0; x < 10; x++) { for (int x = 0; x < 10; x++) {
++iterations; ++iterations;
PeekingStingIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 10, 1000); QueryIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 10, 1000);
BoundedReadIterator ret = data.toUnmappedReads(100, iter); BoundedReadIterator ret = data.toUnmappedReads(100, iter);
// count the reads we've gotten back // count the reads we've gotten back
if (ret == null) { if (ret == null) {
@ -173,7 +172,7 @@ public class SAMByReadsTest extends BaseTest {
while (shardStrategy.hasNext()) { while (shardStrategy.hasNext()) {
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next()); StingSAMIterator ret = data.seek(shardStrategy.next());
assertTrue(ret != null); assertTrue(ret != null);
while (ret.hasNext()) { while (ret.hasNext()) {
ret.next(); ret.next();
@ -227,7 +226,7 @@ class ArtificialResourcePool extends SAMIteratorPool {
} }
@Override @Override
public StingSAMIterator iterator( GenomeLoc loc ) { public QueryIterator iterator( GenomeLoc loc ) {
ArtificialSAMQueryIterator iter = ArtificialSAMUtils.queryReadIterator(1, 10, 100, 1000); ArtificialSAMQueryIterator iter = ArtificialSAMUtils.queryReadIterator(1, 10, 100, 1000);
if (loc != null) { if (loc != null) {
iter.queryContained(loc.getContig(), (int)loc.getStart(), (int)loc.getStop()); iter.queryContained(loc.getContig(), (int)loc.getStart(), (int)loc.getStop());

View File

@ -2,7 +2,7 @@ package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator; import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.junit.Test; import org.junit.Test;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail; import static org.junit.Assert.fail;
@ -80,30 +80,5 @@ public class ArtificialSAMUtilsTest extends BaseTest {
assertEquals(1000, count); assertEquals(1000, count);
} }
@Test
public void testPeeking() {
PeekingStingIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 100);
int count = 0;
while (iter.hasNext()) {
int readCnt = ((ArtificialSAMIterator)(iter)).readsTaken();
// peek the record
SAMRecord rec = iter.peek();
assertTrue(rec.getReferenceIndex() >= 0);
// next the record
SAMRecord rec2 = iter.next();
assertTrue(rec2.getReadName() == rec.getReadName());
assertTrue(rec2.getAlignmentStart() == rec.getAlignmentStart());
// find out how many reads we've taken now
int readCnt2 = ((ArtificialSAMIterator)(iter)).readsTaken();
count++;
if (count < 100*100) assertEquals(readCnt + 1, readCnt2);
else assertEquals(readCnt, readCnt2);
}
assertEquals(100 * 100, count );
}
} }