added changes to support alec toUnmappedRead seek. Huge improvements (orders of magnitude) in unmapped read performance.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1021 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-06-16 22:15:56 +00:00
parent 4f6d26849f
commit 6ee64c7e43
10 changed files with 340 additions and 170 deletions

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMReadGroupRecord;
@ -21,24 +20,43 @@ import java.util.ArrayList;
import java.util.List;
import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* User: aaron
* Date: Mar 26, 2009
* Time: 2:36:16 PM
* <p/>
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
* <p/>
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
* Converts shards to SAM iterators over the specified region
*/
public class SAMDataSource implements SimpleDataSource {
/** Backing support for reads. */
private Reads reads = null;
private final Reads reads;
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMDataSource.class);
@ -62,7 +80,7 @@ public class SAMDataSource implements SimpleDataSource {
/**
* constructor, given sam files
*
* @param reads the list of sam files
* @param reads the list of sam files
* @param byReads are we a by reads traversal, or a loci traversal. We could delete this field
* if we passed in iterGen, which would be a better (although more complicated for the
* consumers of SAMDataSources).
@ -79,11 +97,12 @@ public class SAMDataSource implements SimpleDataSource {
throw new SimpleDataSourceLoadException("SAMDataSource: Unable to load file: " + smFile.getName());
}
}
iteratorPool = new SAMIteratorPool(reads,byReads);
iteratorPool = new SAMIteratorPool(reads, byReads);
}
/**
* For unit testing, add a custom iterator pool.
*
* @param iteratorPool Custom mock iterator pool.
*/
void setResourcePool( SAMIteratorPool iteratorPool ) {
@ -100,7 +119,7 @@ public class SAMDataSource implements SimpleDataSource {
*
* @return an iterator for that region
*/
public StingSAMIterator seekLocus(GenomeLoc location) throws SimpleDataSourceLoadException {
public StingSAMIterator seekLocus( GenomeLoc location ) throws SimpleDataSourceLoadException {
return iteratorPool.iterator(location);
}
@ -180,7 +199,7 @@ public class SAMDataSource implements SimpleDataSource {
iter.close();
}
iter = iteratorPool.iterator(null);
bound = toUnmappedReads(shard.getSize(), iter);
bound = toUnmappedReads(shard.getSize(), (QueryIterator) iter);
}
if (bound == null) {
shard.signalDone();
@ -211,41 +230,24 @@ public class SAMDataSource implements SimpleDataSource {
* @return the bounded iterator that you can use to get the intervaled reads from
* @throws SimpleDataSourceLoadException
*/
BoundedReadIterator toUnmappedReads( long readCount, StingSAMIterator iter ) throws SimpleDataSourceLoadException {
PeekableIterator<SAMRecord> peekable = new PeekableIterator<SAMRecord>(iter);
BoundedReadIterator toUnmappedReads( long readCount, QueryIterator iter ) throws SimpleDataSourceLoadException {
iter.queryUnmappedReads();
int count = 0;
int cnt = 0;
SAMRecord d = null;
while (peekable.hasNext()) {
d = peekable.peek();
int x = d.getReferenceIndex();
if (x < 0)
// we have the magic read that starts the unmapped read segment!
break;
cnt++;
peekable.next();
}
// check to see what happened, did we run out of reads?
if (!peekable.hasNext()) {
return null;
}
// now walk until we've taken the unmapped read count
while (peekable.hasNext() && count < this.readsTaken) {
peekable.next();
while (iter.hasNext() && count < this.readsTaken) {
iter.next();
count++;
}
// check to see what happened, did we run out of reads?
if (!peekable.hasNext()) {
if (!iter.hasNext()) {
return null;
}
// we're not out of unmapped reads, so increment our read cout
this.readsTaken += readCount;
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, peekable), readCount);
return new BoundedReadIterator(StingSAMIteratorAdapter.adapt(reads, iter), readCount);
}
@ -356,77 +358,104 @@ public class SAMDataSource implements SimpleDataSource {
}
class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger,StingSAMIterator> {
/**
* Source information about the reads.
*/
class SAMIteratorPool extends ResourcePool<SamFileHeaderMerger, QueryIterator> {
/** Source information about the reads. */
protected Reads reads;
/**
* Is this a by-reads traversal or a by-locus?
*/
/** Is this a by-reads traversal or a by-locus? */
protected boolean byReads;
/**
* File header for the combined file.
*/
/** File header for the combined file. */
protected SAMFileHeader header;
/** our log, which we want to capture anything from this class */
protected static Logger logger = Logger.getLogger(SAMIteratorPool.class);
protected static Logger logger = Logger.getLogger(SAMIteratorPool.class);
public SAMIteratorPool( Reads reads, boolean byReads ) {
this.reads = reads;
this.byReads = byReads;
SamFileHeaderMerger merger = createNewResource( null );
SamFileHeaderMerger merger = createNewResource(null);
this.header = merger.getMergedHeader();
// Add this resource to the pool.
this.addNewResource( merger );
this.addNewResource(merger);
}
/**
* Get the combined header for all files in the iterator pool.
*/
/** Get the combined header for all files in the iterator pool. */
public SAMFileHeader getHeader() {
return header;
}
protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List<SamFileHeaderMerger> mergers) {
if( mergers.size() == 0 )
protected SamFileHeaderMerger selectBestExistingResource( GenomeLoc position, List<SamFileHeaderMerger> mergers ) {
if (mergers.size() == 0)
return null;
return mergers.get(0);
}
protected SamFileHeaderMerger createNewResource( GenomeLoc position ) {
return createHeaderMerger( reads, SAMFileHeader.SortOrder.coordinate );
return createHeaderMerger(reads, SAMFileHeader.SortOrder.coordinate);
}
protected StingSAMIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) {
protected QueryIterator createIteratorFromResource( GenomeLoc loc, SamFileHeaderMerger headerMerger ) {
final MergingSamRecordIterator2 iterator = new MergingSamRecordIterator2(headerMerger, reads);
if( loc != null ) {
if (loc != null) {
if (byReads)
iterator.queryContained(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
else
iterator.queryOverlapping(loc.getContig(), (int) loc.getStart(), (int) loc.getStop());
}
return new StingSAMIterator() {
public Reads getSourceInfo() { return reads; }
return new QueryIterator() {
public Reads getSourceInfo() {
return reads;
}
public void close() {
iterator.close();
release(this);
}
public Iterator<SAMRecord> iterator() { return this; }
public boolean hasNext() { return iterator.hasNext(); }
public SAMRecord next() { return iterator.next(); }
public void remove() { throw new UnsupportedOperationException("Can't remove from a StingSAMIterator"); }
public Iterator<SAMRecord> iterator() {
return this;
}
public boolean hasNext() {
return iterator.hasNext();
}
public SAMRecord next() {
return iterator.next();
}
public void remove() {
throw new UnsupportedOperationException("Can't remove from a StingSAMIterator");
}
public SAMRecord peek() {
return iterator.peek();
}
public void queryOverlapping( String contig, int start, int stop ) {
iterator.queryOverlapping(contig, start, stop);
}
public void query( String contig, int start, int stop, boolean contained ) {
iterator.query(contig, start, stop, contained);
}
public void queryUnmappedReads() {
iterator.queryUnmappedReads();
}
public void queryContained( String contig, int start, int stop ) {
iterator.queryContained(contig, start, stop);
}
};
}
protected void closeResource( SamFileHeaderMerger resource ) {
for( SAMFileReader reader: resource.getReaders() )
for (SAMFileReader reader : resource.getReaders())
reader.close();
}

View File

@ -34,6 +34,7 @@ import net.sf.samtools.util.CloseableIterator;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.Utils;
import java.lang.reflect.Constructor;
import java.util.Comparator;
@ -46,7 +47,7 @@ import java.util.PriorityQueue;
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, PeekingStingIterator {
public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>, Iterable<SAMRecord>, QueryIterator {
protected PriorityQueue<ComparableSamRecordIterator> pq = null;
protected final SamFileHeaderMerger samHeaderMerger;
protected final SAMFileHeader.SortOrder sortOrder;
@ -54,6 +55,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
private SAMRecord mNextRecord;
protected boolean initialized = false;
protected final Reads reads;
protected boolean warnedUserAboutSortOrder = false; // so we only warn the user once
/**
* Constructs a new merging iterator with the same set of readers and sort order as
@ -62,7 +64,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
* @param headerMerger the header to merge
* @param reads the reads pile
*/
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger, Reads reads) {
public MergingSamRecordIterator2( final SamFileHeaderMerger headerMerger, Reads reads ) {
this.samHeaderMerger = headerMerger;
this.reads = reads;
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
@ -90,11 +92,24 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
}
private void checkSortOrder(SAMFileReader reader) {
/**
* verify the sort order
*
* @param reader the reader to check
*/
private void checkSortOrder( SAMFileReader reader ) {
if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder) {
throw new PicardException("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
if (reads.getSafetyChecking()) {
throw new PicardException("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
} else if(!warnedUserAboutSortOrder) {
warnedUserAboutSortOrder = true;
Utils.warnUser("Files are not compatible with sort order: " + reader.getFileHeader().getSortOrder() +
" vrs " + this.sortOrder + ". Make sure that the SO flag in your bam file is set (The reader attribute for sort order equals "
+ reader.getFileHeader().getAttribute("SO") + " in this case).");
}
}
}
@ -102,7 +117,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
return true;
}
public void queryOverlapping(final String contig, final int start, final int stop) {
public void queryOverlapping( final String contig, final int start, final int stop ) {
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
@ -118,7 +133,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
}
public void query(final String contig, final int start, final int stop, final boolean contained) {
public void query( final String contig, final int start, final int stop, final boolean contained ) {
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
@ -133,7 +148,22 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
}
public void queryContained(final String contig, final int start, final int stop) {
public void queryUnmappedReads() {
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
Iterator<SAMRecord> recordIter = reader.queryUnmapped();
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
addIfNotEmpty(iterator);
}
setInitialized();
}
public void queryContained( final String contig, final int start, final int stop ) {
if (initialized) {
throw new IllegalStateException("You cannot double initialize a MergingSamRecordIterator2");
}
@ -237,7 +267,7 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
*
* @param iterator the iterator to add
*/
protected void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
protected void addIfNotEmpty( final ComparableSamRecordIterator iterator ) {
//System.out.printf("Adding %s %s %d%n", iterator.peek().getReadName(), iterator.peek().getReferenceName(), iterator.peek().getAlignmentStart());
if (iterator.hasNext()) {
pq.offer(iterator);
@ -262,11 +292,11 @@ public class MergingSamRecordIterator2 implements CloseableIterator<SAMRecord>,
// For unsorted build a fake comparator that compares based on object ID
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
return new SAMRecordComparator() {
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
public int fileOrderCompare( final SAMRecord lhs, final SAMRecord rhs ) {
return System.identityHashCode(lhs) - System.identityHashCode(rhs);
}
public int compare(final SAMRecord lhs, final SAMRecord rhs) {
public int compare( final SAMRecord lhs, final SAMRecord rhs ) {
return fileOrderCompare(lhs, rhs);
}
};
@ -345,13 +375,13 @@ class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements
* @param sam the SAM file to read records from
* @param comparator the Comparator to use to provide ordering fo SAMRecords
*/
public ComparableSamRecordIterator(final SAMFileReader sam, final Comparator<SAMRecord> comparator) {
public ComparableSamRecordIterator( final SAMFileReader sam, final Comparator<SAMRecord> comparator ) {
super(sam.iterator());
this.reader = sam;
this.comparator = comparator;
}
public ComparableSamRecordIterator(final SAMFileReader sam, Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator) {
public ComparableSamRecordIterator( final SAMFileReader sam, Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator ) {
super(iterator); // use the provided iterator
this.reader = sam;
this.comparator = comparator;
@ -381,7 +411,7 @@ class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements
*
* @return a negative, 0 or positive number as described in the Comparator interface
*/
public int compareTo(final ComparableSamRecordIterator that) {
public int compareTo( final ComparableSamRecordIterator that ) {
if (this.comparator.getClass() != that.comparator.getClass()) {
throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
"have different orderings internally");

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
/*
* Copyright (c) 2009 The Broad Institute
@ -30,14 +29,12 @@ import net.sf.samtools.SAMRecord;
/**
* @author aaron
* <p/>
* Interface Peekable
* Class PeekingIterator
* <p/>
* This interface indicates that
* a peekable interface, that requires a peek() method
*/
public interface PeekingStingIterator extends StingSAMIterator {
/**
* peek, given the specified type
* @return
*/
SAMRecord peek();
public interface PeekingIterator<T> {
/** @return returns a peeked value */
public T peek();
}

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMRecordComparator;
import net.sf.samtools.SAMFileReader;
import java.util.Iterator;
/*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* @author aaron
*
* This interface indicates that the iterator is query able
*/
public interface QueryIterator extends StingSAMIterator, PeekingIterator<SAMRecord> {
/**
* The required methods to query able
**/
public void queryOverlapping(final String contig, final int start, final int stop);
public void query(final String contig, final int start, final int stop, final boolean contained);
public void queryUnmappedReads();
public void queryContained(final String contig, final int start, final int stop);
}

View File

@ -50,13 +50,13 @@ import java.math.BigInteger;
*
* A descriptions should go here. Blame aaron if it's missing.
*/
public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
public class ReadValidationWalker extends ReadWalker<SAMRecord, SAMRecord> {
// our MD5 sum
private MessageDigest m;
// private list of md5sums
private final List<BigInteger> list = new ArrayList<BigInteger>();
private final List<String> list = new ArrayList<String>();
/**
* The initialize function.
@ -94,8 +94,8 @@ public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
* bam file, if it was specified on the command line
* @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
*/
public Integer reduceInit() {
return new Integer(0);
public SAMRecord reduceInit() {
return null;
}
/**
@ -104,31 +104,16 @@ public class ReadValidationWalker extends ReadWalker<SAMRecord, Integer> {
* @param output the output source
* @return the SAMFileWriter, so that the next reduce can emit to the same source
*/
public Integer reduce( SAMRecord read, Integer output ) {
byte[] ray = new byte[read.getReadName().length() + 5];
int x = 0;
for (char c: read.getReadName().toCharArray()) {
ray[x] = (byte)c;
//System.err.println("adding " + c + " to pos " + x);
x++;
public SAMRecord reduce( SAMRecord read, SAMRecord output ) {
if (output == null)
return read;
if ((read.getReferenceIndex() == output.getReferenceIndex()) && (read.getAlignmentStart() < output.getAlignmentStart())) {
System.err.println("saw the read " + read.getReadName() + " duplicated, old alignment = " + output.getAlignmentStart());
}
//System.err.println(read.getReadName() + " name, alignment = " + read.getAlignmentStart());
int y = 0;
for (;y < 4; y++) {
ray[x+y] = (byte)((read.getAlignmentStart() >> y * 8) & 0x000f);
else if (read.getReferenceIndex() != output.getReferenceIndex()){
System.err.println("Switching Chromo");
}
ray[x+y] = read.getSecondOfPairFlag() ? (byte)1 : (byte)0;
BigInteger bigInt = new BigInteger(m.digest(ray));
//System.err.println(bigInt.toString());
m.reset();
if (this.list.contains(bigInt)) {
throw new StingException("Seen Read: " + bigInt + "-> " + read.getReadName() + " before (list size = " + list.size() + ")");
}
if (read.getAlignmentStart() < output) {
throw new StingException("Seen Read " + read.getReadName() + " has alignment of " + read.getAlignmentStart() + " before (list size = " + output + ")");
}
list.add(bigInt);
return read.getAlignmentStart();
return read;
}

View File

@ -1,9 +1,12 @@
package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.Reads;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.util.PeekableIterator;
import java.util.Iterator;
@ -34,7 +37,7 @@ import java.util.Iterator;
*/
/** this fake iterator allows us to look at how specific piles of reads are handled */
public class ArtificialSAMIterator implements PeekingStingIterator {
public class ArtificialSAMIterator implements StingSAMIterator {
protected int currentChromo = 0;

View File

@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord;
import java.util.List;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
/*
@ -41,7 +42,7 @@ import org.broadinstitute.sting.utils.StingException;
* to test out classes that use specific itervals. The reads returned will
* all lie in order in the specified interval.
*/
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
public class ArtificialSAMQueryIterator extends ArtificialSAMIterator implements QueryIterator {
// get the next positon
protected int finalPos = 0;
@ -88,6 +89,39 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
initialize(contig, start, stop);
}
@Override
public void query( String contig, int start, int stop, boolean contained ) {
if (contained)
queryContained(contig, start, stop);
else
queryOverlapping(contig, start, stop);
}
@Override
public void queryUnmappedReads() {
initializeUnmapped();
}
/**
* initialize the iterator to an unmapped read position
*/
public void initializeUnmapped() {
ensureUntouched();
while (super.hasNext() && this.peek().getReferenceIndex() >= 0) {
super.next();
}
// sanity check that we have an actual matching read next
SAMRecord rec = this.peek();
if (rec == null) {
throw new StingException("The next read doesn't match");
}
// set the seeked variable to true
seeked = true;
}
/**
* initialize the query iterator
@ -124,7 +158,7 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
// sanity check that we have an actual matching read next
SAMRecord rec = this.peek();
if (!matches(rec)) {
throw new StingException("The next read doesn't match");
throw new StingException("The next read doesn't match");
}
// set the seeked variable to true
seeked = true;
@ -141,6 +175,11 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator {
if (rec.getReferenceIndex() != this.contigIndex) {
return false;
}
// if we have an unmapped read, matching the contig is good enough for us
if (rec.getReferenceIndex() < 0) {
return true;
}
if (!overlapping) {
// if the start or the end are somewhere within our range
if (( rec.getAlignmentStart() >= startPos && rec.getAlignmentEnd() <= finalPos )) {

View File

@ -5,8 +5,7 @@ import net.sf.samtools.*;
import java.io.File;
import java.util.*;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.broadinstitute.sting.utils.StingException;
/**
@ -103,6 +102,51 @@ public class ArtificialSAMUtils {
return header;
}
/**
* setup a default read group for a SAMFileHeader
*
* @param header the header to set
* @param readGroupID the read group ID tag
* @param sampleName the sample name
*
* @return the adjusted SAMFileHeader
*/
public static SAMFileHeader createDefaultReadGroup( SAMFileHeader header, String readGroupID, String sampleName ) {
SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupID);
rec.setSample(sampleName);
List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
readGroups.add(rec);
header.setReadGroups(readGroups);
return header;
}
/**
* setup read groups for the specified read groups and sample names
*
* @param header the header to set
* @param readGroupIDs the read group ID tags
* @param sampleNames the sample names
*
* @return the adjusted SAMFileHeader
*/
public static SAMFileHeader createEnumeratedReadGroups( SAMFileHeader header, List<String> readGroupIDs, List<String> sampleNames ) {
if (readGroupIDs.size() != sampleNames.size()) {
throw new StingException("read group count and sample name count must be the same");
}
List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
int x = 0;
for (; x < readGroupIDs.size(); x++) {
SAMReadGroupRecord rec = new SAMReadGroupRecord(readGroupIDs.get(x));
rec.setSample(sampleNames.get(x));
readGroups.add(rec);
}
header.setReadGroups(readGroups);
return header;
}
/**
* Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read
*
@ -115,7 +159,7 @@ public class ArtificialSAMUtils {
* @return the artificial read
*/
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
if( alignmentStart == 0 )
if (alignmentStart == 0)
throw new StingException("Invalid alignment start for artificial read");
SAMRecord record = new SAMRecord(header);
record.setReadName(name);
@ -128,20 +172,26 @@ public class ArtificialSAMUtils {
return record;
}
/**
* create an iterator containing the specified read piles
* Create an artificial read based on the parameters. The cigar string will be *M, where * is the length of the read
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param header the SAM header to associate the read with
* @param name the name of the read
* @param refIndex the reference index, i.e. what chromosome to associate it with
* @param alignmentStart where to start the alignment
* @param bases the sequence of the read
* @param qual the qualities of the read
*
* @return StingSAMIterator representing the specified amount of fake data
* @return the artificial read
*/
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMIterator(startingChr, endingChr, readCount, header);
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, byte[] bases, byte[] qual ) {
if (bases.length != qual.length) {
throw new StingException("Passed in read string is different length then the quality array");
}
SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length);
rec.setReadBases(bases);
rec.setBaseQualities(bases);
return rec;
}
/**
@ -150,14 +200,29 @@ public class ArtificialSAMUtils {
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static QueryIterator unmappedReadIterator( int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
}
/**
* create an iterator containing the specified read piles
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
*
* @return StingSAMIterator representing the specified amount of fake data
*/
public static PeekingStingIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
public static QueryIterator unmappedReadIterator( int startingChr, int endingChr, int readCount, int unmappedReadCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header);
}
/**
@ -171,16 +236,16 @@ public class ArtificialSAMUtils {
*/
public static ArtificialSAMQueryIterator queryReadIterator( int startingChr, int endingChr, int readCount ) {
SAMFileHeader header = createArtificialSamHeader(( endingChr - startingChr ) + 1, startingChr, readCount + DEFAULT_READ_LENGTH);
return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, 0, header);
}
/**
* create an ArtificialSAMQueryIterator containing the specified read piles
*
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param startingChr the chromosome (reference ID) to start from
* @param endingChr the id to end with
* @param readCount the number of reads per chromosome
* @param unmappedReadCount the count of unmapped reads to place at the end of the iterator, like in a sorted bam file
*
* @return StingSAMIterator representing the specified amount of fake data

View File

@ -3,7 +3,6 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import static junit.framework.Assert.fail;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
@ -84,7 +83,7 @@ public class SAMByReadsTest extends BaseTest {
SAMDataSource data = new SAMDataSource(reads,true);
for (int x = 0; x < 10; x++) {
++iterations;
PeekingStingIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 10, 1000);
QueryIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 10, 1000);
BoundedReadIterator ret = data.toUnmappedReads(100, iter);
// count the reads we've gotten back
if (ret == null) {
@ -173,7 +172,7 @@ public class SAMByReadsTest extends BaseTest {
while (shardStrategy.hasNext()) {
BoundedReadIterator ret = (BoundedReadIterator)data.seek(shardStrategy.next());
StingSAMIterator ret = data.seek(shardStrategy.next());
assertTrue(ret != null);
while (ret.hasNext()) {
ret.next();
@ -227,7 +226,7 @@ class ArtificialResourcePool extends SAMIteratorPool {
}
@Override
public StingSAMIterator iterator( GenomeLoc loc ) {
public QueryIterator iterator( GenomeLoc loc ) {
ArtificialSAMQueryIterator iter = ArtificialSAMUtils.queryReadIterator(1, 10, 100, 1000);
if (loc != null) {
iter.queryContained(loc.getContig(), (int)loc.getStart(), (int)loc.getStop());

View File

@ -2,7 +2,7 @@ package org.broadinstitute.sting.utils.sam;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.PeekingStingIterator;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
@ -80,30 +80,5 @@ public class ArtificialSAMUtilsTest extends BaseTest {
assertEquals(1000, count);
}
@Test
public void testPeeking() {
PeekingStingIterator iter = ArtificialSAMUtils.unmappedReadIterator(1, 100, 100);
int count = 0;
while (iter.hasNext()) {
int readCnt = ((ArtificialSAMIterator)(iter)).readsTaken();
// peek the record
SAMRecord rec = iter.peek();
assertTrue(rec.getReferenceIndex() >= 0);
// next the record
SAMRecord rec2 = iter.next();
assertTrue(rec2.getReadName() == rec.getReadName());
assertTrue(rec2.getAlignmentStart() == rec.getAlignmentStart());
// find out how many reads we've taken now
int readCnt2 = ((ArtificialSAMIterator)(iter)).readsTaken();
count++;
if (count < 100*100) assertEquals(readCnt + 1, readCnt2);
else assertEquals(readCnt, readCnt2);
}
assertEquals(100 * 100, count );
}
}