diff --git a/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java
index b08f633cb..c3f325c71 100644
--- a/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java
+++ b/playground/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java
@@ -26,6 +26,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
     @Option(shortName="DBSNP", doc="DBSNP file", optional=true) public String DBSNP_FILE = null;
     @Option(shortName="THREADED_IO", doc="If true, enables threaded I/O operations", optional=true) public String ENABLED_THREADED_IO = "false";
     @Option(shortName="U", doc="If true, enables unsafe operations, nothing will be checked at runtime. You better know what you are doing if you set this flag.", optional=false) public String UNSAFE = "false";
+    @Option(shortName="SORT_ON_FLY", doc="If true, enables on fly sorting of reads file.", optional=true) public String ENABLED_SORT_ON_FLY = "false";
 
     public static HashMap MODULES = new HashMap();
     public static void addModule(final String name, final Object walker) {
@@ -103,6 +104,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
         }
 
         engine.setSafetyChecking(! UNSAFE.toLowerCase().equals("true"));
+        engine.setSortOnFly(ENABLED_SORT_ON_FLY.toLowerCase().equals("true"));
         engine.initialize(ENABLED_THREADED_IO.toLowerCase().equals("true"));
 
         //engine.testReference();
diff --git a/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java b/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java
index 2c68271d9..225cac013 100755
--- a/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java
+++ b/playground/java/src/org/broadinstitute/sting/gatk/TraversalEngine.java
@@ -77,6 +77,8 @@ public class TraversalEngine {
 
     public boolean DEBUGGING = false;
     public boolean beSafeP = true;
+    public boolean SORT_ON_FLY = false;       // when true, reads are sorted in memory as they are loaded
+    public int MAX_ON_FLY_SORTS = 100000;     // cap on the number of reads that may be sorted on the fly
     public long N_RECORDS_TO_PRINT = 100000;
     public int THREADED_IO_BUFFER_SIZE = 10000;
 
@@ -117,6 +119,17 @@ public class TraversalEngine {
             System.out.printf("*** Turning off safety checking, I hope you know what you are doing. Errors will result in debugging assert failures and other inscrutable messages...%n");
         this.beSafeP = beSafeP;
     }
+    /**
+     * Turns in-memory sorting of the reads file on or off.  When enabled, the reads
+     * are sorted as they are loaded and at most MAX_ON_FLY_SORTS reads are accepted.
+     *
+     * @param SORT_ON_FLY true to sort the reads file on the fly
+     */
+    public void setSortOnFly( final boolean SORT_ON_FLY ) {
+        if ( SORT_ON_FLY )
+            System.out.println("Sorting read file on the fly: max reads allowed is " + MAX_ON_FLY_SORTS);
+        this.SORT_ON_FLY = SORT_ON_FLY;
+    }
 
     // --------------------------------------------------------------------------------------------------------------
     //
@@ -306,9 +319,11 @@ public class TraversalEngine {
             throw new RuntimeIOException(ex);
         }
 
-        if ( beSafeP )
+        if ( SORT_ON_FLY )
+            samReadIter = new SortSamIterator(samReadIter, MAX_ON_FLY_SORTS);
+        else if ( beSafeP )
             samReadIter = new VerifyingSamIterator(samReadIter);
 
         if ( THREADED_IO ) {
             System.out.printf("Enabling threaded I/O with buffer of %d reads%n", THREADED_IO_BUFFER_SIZE);
             samReadIter = new ThreadedIterator(samReadIter, THREADED_IO_BUFFER_SIZE);
@@ -455,7 +470,7 @@ public class TraversalEngine {
     }
 
     public void verifySortOrder(final boolean requiresSortedOrder) {
-        if ( beSafeP && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) {
+        if ( beSafeP && !SORT_ON_FLY && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate ) {
             final String msg = "SAM file is not sorted in coordinate order (according to header) Walker type with given arguments requires a sorted file for correct processing";
             if ( requiresSortedOrder || strictness == SAMFileReader.ValidationStringency.STRICT )
                 throw new RuntimeIOException(msg);
diff --git a/playground/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java b/playground/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java
new file mode 100755
index 000000000..7b63f4c14
--- /dev/null
+++ b/playground/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java
@@ -0,0 +1,77 @@
+package org.broadinstitute.sting.gatk.iterators;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMFileReader;
+import net.sf.samtools.util.RuntimeIOException;
+
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.Utils;
+
+/**
+ * Iterator that reads every record from an unsorted source up front, sorts the
+ * records by genomic location, and then hands them out in sorted order.
+ *
+ * The whole input is held in memory, so the number of records is capped by the
+ * limit passed to the constructor.
+ *
+ * User: mdepristo
+ * Date: Mar 15, 2009
+ * Time: 6:02:31 PM
+ */
+public class SortSamIterator implements Iterator<SAMRecord> {
+
+    /** Iterates over the fully sorted, in-memory list of wrapped records. */
+    private final Iterator<ComparableSAMRecord> it;
+
+    /**
+     * Drains unsortedIter into memory and sorts the records by genomic location.
+     *
+     * @param unsortedIter source of records, in any order
+     * @param maxSorts     maximum number of records that may be sorted
+     * @throws UnsupportedOperationException if the source yields more than maxSorts records
+     */
+    public SortSamIterator(Iterator<SAMRecord> unsortedIter, int maxSorts) {
+
+        ArrayList<ComparableSAMRecord> list = new ArrayList<ComparableSAMRecord>();
+        while (unsortedIter.hasNext()) {
+            list.add(new ComparableSAMRecord(unsortedIter.next()));
+            // choose an arbitrary length to limit sorting for now
+            if (list.size() > maxSorts)
+                throw new UnsupportedOperationException("Can not sort files with more than " + maxSorts + " reads on the fly!");
+        }
+        Collections.sort(list);
+        it = list.iterator();
+    }
+
+    public boolean hasNext() { return it.hasNext(); }
+    public SAMRecord next() { return it.next().getRecord(); }
+
+    public void remove() {
+        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
+    }
+
+    /** Pairs a record with an ordering based on its genomic location. */
+    private static final class ComparableSAMRecord implements Comparable<ComparableSAMRecord> {
+
+        private final SAMRecord record;
+
+        public ComparableSAMRecord(SAMRecord record) {
+            this.record = record;
+        }
+
+        public SAMRecord getRecord() {
+            return record;
+        }
+
+        public int compareTo(ComparableSAMRecord o) {
+            GenomeLoc myLoc = Utils.genomicLocationOf(record);
+            GenomeLoc hisLoc = Utils.genomicLocationOf(o.getRecord());
+            return myLoc.compareTo(hisLoc);
+        }
+    }
+}