diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 8d8276eb1..95102c7d5 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -107,8 +107,8 @@ public class GenomeAnalysisTK extends CommandLineProgram { m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO"); m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE"); m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS"); - m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction of reads to downsample to", "DOWNSAMPLE_FRACTION"); - m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage to downsample to", "DOWNSAMPLE_COVERAGE"); + m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction [0.0-1.0] of reads to downsample to", "DOWNSAMPLE_FRACTION"); + m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage [integer] to downsample to", "DOWNSAMPLE_COVERAGE"); m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := ", "INTERVALS_FILE"); m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI"); m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" ); @@ -241,7 +241,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { } if (DOWNSAMPLE_COVERAGE != null) { - ; // TO DO: (Double.parseDouble(DOWNSAMPLE_COVERAGE)); + engine.setDownsampleByCoverage(Integer.parseInt(DOWNSAMPLE_COVERAGE)); } engine.setSafetyChecking(!UNSAFE); diff --git a/java/src/org/broadinstitute/sting/gatk/LocusContext.java b/java/src/org/broadinstitute/sting/gatk/LocusContext.java index c17815ac2..11d66be49 100755 --- a/java/src/org/broadinstitute/sting/gatk/LocusContext.java +++ b/java/src/org/broadinstitute/sting/gatk/LocusContext.java @@ -3,7 +3,10 @@ package org.broadinstitute.sting.gatk; import net.sf.samtools.SAMRecord; import java.util.List; -import java.lang.ref.Reference; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.TreeSet; +import java.util.Random; import org.broadinstitute.sting.utils.GenomeLoc; import edu.mit.broad.picard.reference.ReferenceSequence; @@ -98,4 +101,37 @@ public class LocusContext { public void setReferenceContig(final ReferenceSequence contig) { refContig = contig; } + + public void downsampleToCoverage(int coverage) { + if ( numReads() <= coverage ) + return; + + // randomly choose numbers corresponding to positions in the reads list + Random generator = new Random(); + TreeSet positions = new TreeSet(); + int i = 0; + while ( i < coverage ) { + if (positions.add(new Integer(generator.nextInt(reads.size())))) + i++; + } + + ArrayList downsampledReads = new ArrayList(); + Iterator positionIter = positions.iterator(); + Iterator readsIter = reads.iterator(); + int currentRead = 0; + while ( positionIter.hasNext() ) { + int nextReadToKeep = (Integer)positionIter.next(); + + // fast-forward to the right read + while ( currentRead < nextReadToKeep ) { + readsIter.next(); + currentRead++; + } + + downsampledReads.add(readsIter.next()); + currentRead++; + } + + reads = downsampledReads; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index 22fe74310..da7b0ef84 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -80,10 +80,12 @@ public abstract class TraversalEngine { protected boolean beSafeP = true; protected boolean SORT_ON_FLY = false; protected boolean DOWNSAMPLE_BY_FRACTION = false; + protected boolean DOWNSAMPLE_BY_COVERAGE = false; protected boolean FILTER_UNSORTED_READS = false; protected boolean walkOverAllSites = false; protected int maxOnFlySorts = 100000; protected double downsamplingFraction = 1.0; + protected int downsamplingCoverage = 0; protected long N_RECORDS_TO_PRINT = 100000; protected boolean THREADED_IO = false; protected int THREADED_IO_BUFFER_SIZE = 10000; @@ -169,6 +171,12 @@ public abstract class TraversalEngine { downsamplingFraction = fraction; } + public void setDownsampleByCoverage(final int coverage) { + logger.info("Downsampling to coverage " + coverage); + DOWNSAMPLE_BY_COVERAGE = true; + downsamplingCoverage = coverage; + } + // -------------------------------------------------------------------------------------------------------------- // // functions for dealing locations (areas of the genome we're traversing over) diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java index 3c74e00da..33659b58c 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java @@ -110,6 +110,8 @@ public class TraverseByLoci extends TraversalEngine { // actually get the read and hand it to the walker LocusContext locus = iter.next(); + if ( DOWNSAMPLE_BY_COVERAGE ) + locus.downsampleToCoverage(downsamplingCoverage); // if we don't have a particular interval we're processing, check them all, otherwise only operate at this // location diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java index f592165f0..654d824a6 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLociByReference.java @@ -77,6 +77,8 @@ public class TraverseByLociByReference extends TraverseByLoci { locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads locus.setReferenceContig(refSite.getCurrentContig()); + if ( DOWNSAMPLE_BY_COVERAGE ) + locus.downsampleToCoverage(downsamplingCoverage); sum = walkAtLocus( walker, sum, locus, refSite, rodData ); if (this.maxReads > 0 && this.nRecords > this.maxReads) {