diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 8b026b4ae..8d8276eb1 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -44,7 +44,9 @@ public class GenomeAnalysisTK extends CommandLineProgram { public String HAPMAP_FILE = null; public Boolean ENABLED_THREADED_IO = false; public Boolean UNSAFE = false; - public Boolean ENABLED_SORT_ON_FLY = false; + public String MAX_ON_FLY_SORTS = null; + public String DOWNSAMPLE_FRACTION = null; + public String DOWNSAMPLE_COVERAGE = null; public String INTERVALS_FILE = null; // our walker manager @@ -104,7 +106,9 @@ public class GenomeAnalysisTK extends CommandLineProgram { m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE"); m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO"); m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE"); - m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY"); + m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS"); + m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction of reads to downsample to", "DOWNSAMPLE_FRACTION"); + m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage to downsample to", "DOWNSAMPLE_COVERAGE"); m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := ", "INTERVALS_FILE"); m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI"); m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" ); @@ -228,8 +232,19 @@ public class GenomeAnalysisTK extends CommandLineProgram { engine.setLocationFromFile(INTERVALS_FILE); } + if (MAX_ON_FLY_SORTS != null) { + engine.setSortOnFly(Integer.parseInt(MAX_ON_FLY_SORTS)); + } + + if (DOWNSAMPLE_FRACTION != null) { + engine.setDownsampleByFraction(Double.parseDouble(DOWNSAMPLE_FRACTION)); + } + + if (DOWNSAMPLE_COVERAGE != null) { + ; // TO DO: (Double.parseDouble(DOWNSAMPLE_COVERAGE)); + } + engine.setSafetyChecking(!UNSAFE); - engine.setSortOnFly(ENABLED_SORT_ON_FLY); engine.setThreadedIO(ENABLED_THREADED_IO); engine.setWalkOverAllSites(WALK_ALL_LOCI); engine.initialize(); diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java new file mode 100755 index 000000000..75f9765fd --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.SAMRecord; + +import java.util.Random; +import java.util.Iterator; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; + + +public class DownsampleIterator implements Iterator { + + Iterator it; + Random generator; + int cutoff; + SAMRecord next; + + public DownsampleIterator(Iterator it, double fraction) { + this.it = it; + generator = new Random(); + cutoff = (int)(fraction * 10000); + next = getNextRecord(); + } + + public boolean hasNext() { + return next != null; + } + + public SAMRecord next() { + SAMRecord result = next; + next = getNextRecord(); + return result; + } + + public void remove() { + throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); + } + + private SAMRecord getNextRecord() { + while ( true ) { + if ( !it.hasNext() ) + return null; + SAMRecord rec = it.next(); + if ( generator.nextInt(10000) < cutoff ) + return rec; + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java index 7b63f4c14..1de48cfcb 100755 --- a/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/iterators/SortSamIterator.java @@ -1,10 +1,7 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.util.RuntimeIOException; -import java.util.Arrays; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 44cb226ef..22fe74310 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -79,9 +79,11 @@ public abstract class TraversalEngine { protected boolean DEBUGGING = false; protected boolean beSafeP = true; protected boolean SORT_ON_FLY = false; + protected boolean DOWNSAMPLE_BY_FRACTION = false; protected boolean FILTER_UNSORTED_READS = false; protected boolean walkOverAllSites = false; - protected int MAX_ON_FLY_SORTS = 100000; + protected int maxOnFlySorts = 100000; + protected double downsamplingFraction = 1.0; protected long N_RECORDS_TO_PRINT = 100000; protected boolean THREADED_IO = false; protected int THREADED_IO_BUFFER_SIZE = 10000; @@ -153,10 +155,18 @@ public abstract class TraversalEngine { this.FILTER_UNSORTED_READS = filterUnsorted; } - public void setSortOnFly(final boolean SORT_ON_FLY) { - if (SORT_ON_FLY) - logger.info("Sorting read file on the fly: max reads allowed is " + MAX_ON_FLY_SORTS); - this.SORT_ON_FLY = SORT_ON_FLY; + public void setSortOnFly(final int maxReadsToSort) { + logger.info("Sorting read file on the fly: max reads allowed is " + maxReadsToSort); + SORT_ON_FLY = true; + maxOnFlySorts = maxReadsToSort; + } + + public void setSortOnFly() { setSortOnFly(100000); } + + public void setDownsampleByFraction(final double fraction) { + logger.info("Downsampling to approximately " + (fraction * 100.0) + "% of filtered reads"); + DOWNSAMPLE_BY_FRACTION = true; + downsamplingFraction = fraction; } // -------------------------------------------------------------------------------------------------------------- @@ -337,8 +347,13 @@ public abstract class TraversalEngine { protected Iterator WrapReadsIterator( final Iterator rawIterator, final boolean enableVerification ) { Iterator wrappedIterator = rawIterator; + // NOTE: this (and other filtering) should be done before on-the-fly sorting + // as there is no reason to sort something that we will end of throwing away + if (DOWNSAMPLE_BY_FRACTION) + wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction); + if (SORT_ON_FLY) - wrappedIterator = new SortSamIterator(wrappedIterator, MAX_ON_FLY_SORTS); + wrappedIterator = new SortSamIterator(wrappedIterator, maxOnFlySorts); if (beSafeP && enableVerification) wrappedIterator = new VerifyingSamIterator(wrappedIterator);