Support for dynamically merging data files. Preliminary only -- everything in these systems is still being tested

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@200 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-03-26 14:40:50 +00:00
parent 2808fd4bbd
commit 7cad3acc61
4 changed files with 336 additions and 5 deletions

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk;
import edu.mit.broad.picard.filter.FilteringIterator;
import edu.mit.broad.picard.filter.SamRecordFilter;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import net.sf.functionalj.Function1;
import net.sf.functionalj.FunctionN;
import net.sf.functionalj.Functions;
@ -327,6 +328,9 @@ public class TraversalEngine {
protected <T> void printOnTraversalDone(final String type, T sum) {
printProgress(true, type, null);
System.out.println("Traversal reduce result is " + sum); // TODO: fixme -- how do we use this logger?
final long curTime = System.currentTimeMillis();
final double elapsed = (curTime - startTime) / 1000.0;
logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours%n", elapsed, elapsed / 60, elapsed / 3600));
logger.info(String.format("Traversal skipped %d reads out of %d total (%.2f%%)", nSkippedReads, nReads, (nSkippedReads * 100.0) / nReads));
logger.info(String.format(" -> %d unmapped reads", nUnmappedReads));
logger.info(String.format(" -> %d non-primary reads", nNotPrimary));
@ -361,6 +365,23 @@ public class TraversalEngine {
private Iterator<SAMRecord> getReadsIterator(final SAMFileReader samReader) {
// If the file has an index, querying functions are available. Use them if possible...
if ( samReader == null && readsFile.toString().endsWith(".list") ) {
SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate;
List<SAMFileReader> readers = new ArrayList<SAMFileReader>();
try {
for ( String fileName : new xReadLines(readsFile) ) {
SAMFileReader reader = initializeSAMFile(new File(fileName));
readers.add(reader);
}
SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER);
return new MergingSamRecordIterator2(headerMerger);
}
catch ( FileNotFoundException e ) {
logger.fatal("Couldn't open file in sam file list: " + readsFile);
}
}
if (samReader.hasIndex()) {
return new SamQueryIterator(samReader, locs);
} else {
@ -386,13 +407,19 @@ public class TraversalEngine {
}
/**
 * Opens the given reads file as a SAMFileReader, applying the engine's validation
 * stringency and logging the sort order reported by the file header.
 *
 * A path ending in ".list" is not opened here: null is returned for it instead
 * (temporary hack for trying out dynamic merging).
 *
 * NOTE(review): this listing appears to contain both the pre-change and the post-change
 * lines of the method (the reader construction, header lookup and return each show up
 * twice) -- confirm against the actual file before relying on statement order.
 *
 * @param samFile the SAM/BAM file, or a ".list" file, to initialize
 * @return the opened reader, or null when samFile ends with ".list"
 */
private SAMFileReader initializeSAMFile(final File samFile) {
SAMFileReader samReader = new SAMFileReader(samFile, true);
samReader.setValidationStringency(strictness);
// todo: fixme, this is a hack to try out dynamic merging
if ( samFile.toString().endsWith(".list") ) {
return null;
// todo: this is scary, but it's just for testing purposes. fix with the new DataSource system
} else {
SAMFileReader samReader = new SAMFileReader(samFile, true);
samReader.setValidationStringency(strictness);
final SAMFileHeader header = samReader.getFileHeader();
logger.info(String.format("Sort order is: " + header.getSortOrder()));
final SAMFileHeader header = samReader.getFileHeader();
logger.info(String.format("Sort order is: " + header.getSortOrder()));
return samReader;
return samReader;
}
}
// cleaning up past mistakes

View File

@ -0,0 +1,240 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or
* functionality.
*/
package org.broadinstitute.sting.gatk.iterators;
import edu.mit.broad.picard.sam.SamFileHeaderMerger;
import edu.mit.broad.picard.sam.ReservedTagConstants;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.util.PeekableIterator;
import net.sf.samtools.*;
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or
* functionality.
*/
import java.util.*;
import java.lang.reflect.Constructor;
import org.broadinstitute.sting.playground.gatk.iterators.SeekableSamIteration;
/**
* Provides an iterator interface for merging multiple underlying iterators into a single
* iterable stream. The underlying iterators/files must all have the same sort order unless
* the requested output format is unsorted, in which case any combination is valid.
*/
public class MergingSamRecordIterator2 implements Iterator<SAMRecord>, SeekableSamIteration {
protected PriorityQueue<ComparableSamRecordIterator> pq;
protected final SamFileHeaderMerger samHeaderMerger;
protected final SAMFileHeader.SortOrder sortOrder;
/**
* Constructs a new merging iterator with the same set of readers and sort order as
* provided by the header merger parameter.
*/
public MergingSamRecordIterator2(final SamFileHeaderMerger headerMerger) {
this.samHeaderMerger = headerMerger;
this.sortOrder = headerMerger.getMergedHeader().getSortOrder();
initializePQ();
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
if (this.sortOrder != SAMFileHeader.SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder){
throw new PicardException("Files are not compatible with sort order: " + this.sortOrder );
}
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, comparator);
addIfNotEmpty(iterator);
}
}
protected void initializePQ() {
this.pq = new PriorityQueue<ComparableSamRecordIterator>(samHeaderMerger.getReaders().size());
}
public boolean supportsSeeking() { return true; }
public void queryOverlapping( final String contig, final int start, final int stop ) {
initializePQ(); // reinitialize the system
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
Iterator<SAMRecord> recordIter = reader.queryOverlapping( contig, start, stop);
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
addIfNotEmpty(iterator);
}
}
public void query(final String contig, final int start, final int stop, final boolean contained) {
initializePQ(); // reinitialize the system
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
Iterator<SAMRecord> recordIter = reader.query( contig, start, stop, contained);
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
addIfNotEmpty(iterator);
}
}
public void queryContained(final String contig, final int start, final int stop) {
initializePQ(); // reinitialize the system
final SAMRecordComparator comparator = getComparator();
for (final SAMFileReader reader : samHeaderMerger.getReaders()) {
Iterator<SAMRecord> recordIter = reader.queryContained( contig, start, stop );
final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, recordIter, comparator);
addIfNotEmpty(iterator);
}
}
/** Returns true if any of the underlying iterators has more records, otherwise false. */
public boolean hasNext() {
return !this.pq.isEmpty();
}
/** Returns the next record from the top most iterator during merging. */
public SAMRecord next() {
final ComparableSamRecordIterator iterator = this.pq.poll();
final SAMRecord record = iterator.next();
addIfNotEmpty(iterator);
if (this.samHeaderMerger.hasGroupIdDuplicates()) {
final String id = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID);
final String newId = this.samHeaderMerger.getReadGroupId(iterator.getReader(), id);
record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newId);
}
final String oldProgramGroupId = (String) record.getAttribute(SAMTag.PG.toString());
if (oldProgramGroupId != null) {
final String newProgramGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader(), oldProgramGroupId);
record.setAttribute(SAMTag.PG.toString(), newProgramGroupId);
}
//System.out.printf("NEXT = %s %s %d%n", record.getReadName(), record.getReferenceName(), record.getAlignmentStart());
//System.out.printf("PEEK = %s %s %d%n", this.pq.peek().peek().getReadName(), this.pq.peek().peek().getReferenceName(), this.pq.peek().peek().getAlignmentStart());
return record;
}
/**
* Adds iterator to priority queue. If the iterator has more records it is added
* otherwise it is closed and not added.
*/
protected void addIfNotEmpty(final ComparableSamRecordIterator iterator) {
//System.out.printf("Adding %s %s %d%n", iterator.peek().getReadName(), iterator.peek().getReferenceName(), iterator.peek().getAlignmentStart());
if (iterator.hasNext()) {
pq.offer(iterator);
}
else {
iterator.close();
}
}
/** Unsupported operation. */
public void remove() {
throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()");
}
/**
* Get the right comparator for a given sort order (coordinate, alphabetic). In the
* case of "unsorted" it will return a comparator that gives an arbitrary but reflexive
* ordering.
*/
protected SAMRecordComparator getComparator() {
// For unsorted build a fake comparator that compares based on object ID
if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) {
return new SAMRecordComparator() {
public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) {
return System.identityHashCode(lhs) - System.identityHashCode(rhs);
}
public int compare(final SAMRecord lhs, final SAMRecord rhs) {
return fileOrderCompare(lhs, rhs);
}
};
}
// Otherwise try and figure out what kind of comparator to return and build it
final Class<? extends SAMRecordComparator> type = this.sortOrder.getComparator();
try {
final Constructor<? extends SAMRecordComparator> ctor = type.getConstructor(SAMFileHeader.class);
//System.out.printf("Getting comparator %s%n", ctor.toGenericString());
return ctor.newInstance(this.samHeaderMerger.getMergedHeader());
}
catch (Exception e) {
try {
final Constructor<? extends SAMRecordComparator> ctor = type.getConstructor();
return ctor.newInstance();
}
catch (Exception e2) {
throw new PicardException("Could not instantiate a comparator for sort order: " + this.sortOrder, e2);
}
}
}
/** Returns the merged header that the merging iterator is working from. */
public SAMFileHeader getMergedHeader() {
return this.samHeaderMerger.getMergedHeader();
}
}
// Should replace picard class with the same name
/**
 * A peekable SAM record iterator that remembers which reader it came from and can be
 * ordered against other instances by the record each one would return next.
 */
class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements Comparable<ComparableSamRecordIterator> {
    private final Comparator<SAMRecord> comparator;
    private final SAMFileReader reader;

    /**
     * Builds an iterator over all records of the supplied SAM file; ordering against
     * other instances is decided by the supplied comparator.
     *
     * @param sam        the SAM file to read records from
     * @param comparator the Comparator to use to provide ordering of SAMRecords
     */
    public ComparableSamRecordIterator(final SAMFileReader sam, final Comparator<SAMRecord> comparator) {
        super(sam.iterator());
        this.comparator = comparator;
        this.reader = sam;
    }

    /**
     * Builds an iterator over an already-created record iterator (for example the result
     * of a query on the reader) instead of the reader's full contents.
     *
     * @param sam        the SAM file the records originate from
     * @param iterator   the records to iterate over
     * @param comparator the Comparator to use to provide ordering of SAMRecords
     */
    public ComparableSamRecordIterator(final SAMFileReader sam, Iterator<SAMRecord> iterator, final Comparator<SAMRecord> comparator) {
        super(iterator);
        this.comparator = comparator;
        this.reader = sam;
    }

    /** Returns the reader from which this iterator was constructed. */
    public SAMFileReader getReader() {
        return this.reader;
    }

    /**
     * Orders this iterator relative to another one by comparing the next record of each.
     * Both iterators must carry the same comparator class.
     *
     * @param that another iterator to compare to
     * @return a negative, 0 or positive number as described in the Comparator interface
     * @throws IllegalStateException if the two iterators use different comparator classes
     */
    public int compareTo(final ComparableSamRecordIterator that) {
        final Class<?> ownOrdering = this.comparator.getClass();
        final Class<?> otherOrdering = that.comparator.getClass();

        if (ownOrdering == otherOrdering) {
            final SAMRecord mine = this.peek();
            final SAMRecord theirs = that.peek();
            return this.comparator.compare(mine, theirs);
        }

        throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
                "have different orderings internally");
    }
}

View File

@ -0,0 +1,21 @@
package org.broadinstitute.sting.playground.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecordComparator;
import java.util.Iterator;
/**
 * Operations offered by a SAM record source that can be repositioned onto a region of a
 * contig. The three query methods take the same contig/start/stop arguments; how the
 * region is interpreted is left to the implementation.
 *
 * @author mdepristo
 */
public interface SeekableSamIteration {
    /**
     * @return whether the query methods of this object may be used
     */
    boolean supportsSeeking();

    /**
     * Repositions the source using an "overlapping" query.
     *
     * @param contig name of the contig to query
     * @param start  start of the queried interval
     * @param stop   end of the queried interval
     */
    void queryOverlapping(String contig, int start, int stop);

    /**
     * Repositions the source using a general query.
     *
     * @param contig    name of the contig to query
     * @param start     start of the queried interval
     * @param stop      end of the queried interval
     * @param contained flag forwarded with the query; presumably selects contained versus
     *                  overlapping matching -- confirm against the implementation
     */
    void query(String contig, int start, int stop, boolean contained);

    /**
     * Repositions the source using a "contained" query.
     *
     * @param contig name of the contig to query
     * @param start  start of the queried interval
     * @param stop   end of the queried interval
     */
    void queryContained(String contig, int start, int stop);
}

View File

@ -0,0 +1,43 @@
package org.broadinstitute.sting.playground.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import java.util.Iterator;
import org.broadinstitute.sting.playground.gatk.iterators.SeekableSamIteration;
/**
 * Record iterator over a single SAMFileReader that can be repositioned: each query
 * method asks the reader for a new iterator and continues iteration from that one.
 *
 * @author depristo
 */
public class SeekableSamIterator implements Iterator<SAMRecord>, SeekableSamIteration {
    /** The iterator currently being consumed; swapped out by every query call. */
    protected Iterator<SAMRecord> it;
    /** The reader that query calls are issued against. */
    protected SAMFileReader reader;

    /**
     * @param it     the iterator to consume until the first query call
     * @param reader the reader used to serve query calls
     */
    public SeekableSamIterator(Iterator<SAMRecord> it, SAMFileReader reader) {
        this.reader = reader;
        this.it = it;
    }

    /** Always true. */
    public boolean supportsSeeking() {
        return true;
    }

    /** Continues iteration from the reader's queryOverlapping result for the given region. */
    public void queryOverlapping( final String contig, final int start, final int stop ) {
        final Iterator<SAMRecord> overlapping = this.reader.queryOverlapping( contig, start, stop );
        this.it = overlapping;
    }

    /** Continues iteration from the reader's query result for the given region and flag. */
    public void query(final String contig, final int start, final int stop, final boolean contained) {
        final Iterator<SAMRecord> matches = this.reader.query( contig, start, stop, contained );
        this.it = matches;
    }

    /** Continues iteration from the reader's queryContained result for the given region. */
    public void queryContained(final String contig, final int start, final int stop) {
        final Iterator<SAMRecord> containedRecords = this.reader.queryContained( contig, start, stop );
        this.it = containedRecords;
    }

    /** Delegates to the current iterator. */
    public boolean hasNext() {
        return this.it.hasNext();
    }

    /** Delegates to the current iterator. */
    public SAMRecord next() {
        return this.it.next();
    }

    /** Delegates to the current iterator. */
    public void remove () {
        this.it.remove();
    }
}