Added ability to sort reads on the fly

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@83 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-03-17 20:29:09 +00:00
parent 0362cb9e59
commit 1aa3958644
3 changed files with 78 additions and 3 deletions

View File

@ -26,6 +26,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
@Option(shortName="DBSNP", doc="DBSNP file", optional=true) public String DBSNP_FILE = null;
@Option(shortName="THREADED_IO", doc="If true, enables threaded I/O operations", optional=true) public String ENABLED_THREADED_IO = "false";
@Option(shortName="U", doc="If true, enables unsafe operations, nothing will be checked at runtime. You better know what you are doing if you set this flag.", optional=false) public String UNSAFE = "false";
@Option(shortName="SORT_ON_FLY", doc="If true, enables on fly sorting of reads file.", optional=false) public String ENABLED_SORT_ON_FLY = "false";
public static HashMap<String, Object> MODULES = new HashMap<String,Object>();
public static void addModule(final String name, final Object walker) {
@ -103,6 +104,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
}
engine.setSafetyChecking(! UNSAFE.toLowerCase().equals("true"));
// Sort on the fly only when the SORT_ON_FLY option is "true" (no negation: unlike UNSAFE, this flag maps directly onto the engine setting).
engine.setSortOnFly(ENABLED_SORT_ON_FLY.toLowerCase().equals("true"));
engine.initialize(ENABLED_THREADED_IO.toLowerCase().equals("true"));
//engine.testReference();

View File

@ -77,6 +77,8 @@ public class TraversalEngine {
public boolean DEBUGGING = false;
public boolean beSafeP = true;
public boolean SORT_ON_FLY = false;
public int MAX_ON_FLY_SORTS = 100000;
public long N_RECORDS_TO_PRINT = 100000;
public int THREADED_IO_BUFFER_SIZE = 10000;
@ -117,6 +119,11 @@ public class TraversalEngine {
System.out.printf("*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages...%n");
this.beSafeP = beSafeP;
}
/**
 * Enables or disables sorting of the read file on the fly.
 * When enabling, a notice that includes the MAX_ON_FLY_SORTS limit is printed to standard output.
 *
 * @param SORT_ON_FLY true to sort reads on the fly, false otherwise
 */
public void setSortOnFly( final boolean SORT_ON_FLY ) {
    this.SORT_ON_FLY = SORT_ON_FLY;
    if ( !SORT_ON_FLY ) {
        return;
    }
    System.out.println("Sorting read file on the fly: max reads allowed is " + MAX_ON_FLY_SORTS);
}
// --------------------------------------------------------------------------------------------------------------
//
@ -306,9 +313,11 @@ public class TraversalEngine {
throw new RuntimeIOException(ex);
}
if ( beSafeP )
if ( SORT_ON_FLY )
samReadIter = new SortSamIterator(samReadIter, MAX_ON_FLY_SORTS);
else if ( beSafeP )
samReadIter = new VerifyingSamIterator(samReadIter);
if ( THREADED_IO ) {
System.out.printf("Enabling threaded I/O with buffer of %d reads%n", THREADED_IO_BUFFER_SIZE);
samReadIter = new ThreadedIterator<SAMRecord>(samReadIter, THREADED_IO_BUFFER_SIZE);
@ -455,7 +464,7 @@ public class TraversalEngine {
}
public void verifySortOrder(final boolean requiresSortedOrder) {
if ( beSafeP && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) {
if ( beSafeP && !SORT_ON_FLY && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) {
final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing";
if ( requiresSortedOrder || strictness == SAMFileReader.ValidationStringency.STRICT )
throw new RuntimeIOException(msg);

View File

@ -0,0 +1,64 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.util.RuntimeIOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
/**
 * Iterator that hands back the records of an unsorted SAM read stream in sorted order.
 *
 * The entire input is read into memory and sorted inside the constructor, so the caller
 * must supply an upper bound on the number of records it is willing to buffer. Records are
 * ordered by comparing the GenomeLoc that Utils.genomicLocationOf() returns for each of them.
 *
 * Created by IntelliJ IDEA.
 * User: mdepristo
 * Date: Mar 15, 2009
 * Time: 6:02:31 PM
 */
public class SortSamIterator implements Iterator<SAMRecord> {
    /** Iterator over the fully sorted, in-memory copy of the input. */
    private final Iterator<ComparableSAMRecord> it;

    /**
     * Drains the given iterator, sorts everything it produced and prepares to replay it.
     *
     * @param unsortedIter source of records, in any order; it is consumed completely
     * @param maxSorts     maximum number of records that may be buffered and sorted
     * @throws UnsupportedOperationException if the source yields more than maxSorts records
     */
    public SortSamIterator(Iterator<SAMRecord> unsortedIter, int maxSorts) {
        ArrayList<ComparableSAMRecord> list = new ArrayList<ComparableSAMRecord>();
        while (unsortedIter.hasNext()) {
            list.add(new ComparableSAMRecord(unsortedIter.next()));
            // Everything is held in memory, so give up as soon as the caller's limit is exceeded.
            if (list.size() > maxSorts)
                throw new UnsupportedOperationException("Can not sort files with more than " + maxSorts + " reads on the fly!");
        }
        Collections.sort(list);
        it = list.iterator();
    }

    /** @return true while sorted records remain */
    public boolean hasNext() { return it.hasNext(); }

    /** @return the next record in sorted order */
    public SAMRecord next() { return it.next().getRecord(); }

    /**
     * Removal is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
    }

    /**
     * Pairs a record with its genomic location so that records can be sorted with
     * Collections.sort(). Declared static because it needs nothing from the enclosing
     * iterator; a non-static inner class would store an extra reference per record.
     */
    private static class ComparableSAMRecord implements Comparable<ComparableSAMRecord> {
        private final SAMRecord record;
        // Computed once here rather than on every comparison made during the sort.
        private final GenomeLoc loc;

        public ComparableSAMRecord(SAMRecord record) {
            this.record = record;
            this.loc = Utils.genomicLocationOf(record);
        }

        public SAMRecord getRecord() {
            return record;
        }

        /** Orders records by the comparison defined on their GenomeLoc. */
        public int compareTo(ComparableSAMRecord o) {
            return loc.compareTo(o.loc);
        }
    }
}