Added iterator to randomly downsample to a given fraction of the reads.

Also, updated sort iterator to allow user to input max sorts.
Put in placeholder for downsampling to given coverage.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@243 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-04-01 02:11:13 +00:00
parent 385736469c
commit 3af4290a49
4 changed files with 88 additions and 12 deletions

View File

@ -44,7 +44,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
public String HAPMAP_FILE = null;
public Boolean ENABLED_THREADED_IO = false;
public Boolean UNSAFE = false;
public Boolean ENABLED_SORT_ON_FLY = false;
public String MAX_ON_FLY_SORTS = null;
public String DOWNSAMPLE_FRACTION = null;
public String DOWNSAMPLE_COVERAGE = null;
public String INTERVALS_FILE = null;
// our walker manager
@ -104,7 +106,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE");
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY");
m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS");
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction of reads to downsample to", "DOWNSAMPLE_FRACTION");
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage to downsample to", "DOWNSAMPLE_COVERAGE");
m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := <contig> <start> <end>", "INTERVALS_FILE");
m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI");
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
@ -228,8 +232,19 @@ public class GenomeAnalysisTK extends CommandLineProgram {
engine.setLocationFromFile(INTERVALS_FILE);
}
if (MAX_ON_FLY_SORTS != null) {
engine.setSortOnFly(Integer.parseInt(MAX_ON_FLY_SORTS));
}
if (DOWNSAMPLE_FRACTION != null) {
engine.setDownsampleByFraction(Double.parseDouble(DOWNSAMPLE_FRACTION));
}
if (DOWNSAMPLE_COVERAGE != null) {
; // TO DO: (Double.parseDouble(DOWNSAMPLE_COVERAGE));
}
engine.setSafetyChecking(!UNSAFE);
engine.setSortOnFly(ENABLED_SORT_ON_FLY);
engine.setThreadedIO(ENABLED_THREADED_IO);
engine.setWalkOverAllSites(WALK_ALL_LOCI);
engine.initialize();

View File

@ -0,0 +1,49 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import java.util.Random;
import java.util.Iterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
public class DownsampleIterator implements Iterator<SAMRecord> {
Iterator<SAMRecord> it;
Random generator;
int cutoff;
SAMRecord next;
public DownsampleIterator(Iterator<SAMRecord> it, double fraction) {
this.it = it;
generator = new Random();
cutoff = (int)(fraction * 10000);
next = getNextRecord();
}
public boolean hasNext() {
return next != null;
}
public SAMRecord next() {
SAMRecord result = next;
next = getNextRecord();
return result;
}
public void remove() {
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
}
private SAMRecord getNextRecord() {
while ( true ) {
if ( !it.hasNext() )
return null;
SAMRecord rec = it.next();
if ( generator.nextInt(10000) < cutoff )
return rec;
}
}
}

View File

@ -1,10 +1,7 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.util.RuntimeIOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;

View File

@ -79,9 +79,11 @@ public abstract class TraversalEngine {
protected boolean DEBUGGING = false;
protected boolean beSafeP = true;
protected boolean SORT_ON_FLY = false;
protected boolean DOWNSAMPLE_BY_FRACTION = false;
protected boolean FILTER_UNSORTED_READS = false;
protected boolean walkOverAllSites = false;
protected int MAX_ON_FLY_SORTS = 100000;
protected int maxOnFlySorts = 100000;
protected double downsamplingFraction = 1.0;
protected long N_RECORDS_TO_PRINT = 100000;
protected boolean THREADED_IO = false;
protected int THREADED_IO_BUFFER_SIZE = 10000;
@ -153,10 +155,18 @@ public abstract class TraversalEngine {
this.FILTER_UNSORTED_READS = filterUnsorted;
}
public void setSortOnFly(final boolean SORT_ON_FLY) {
if (SORT_ON_FLY)
logger.info("Sorting read file on the fly: max reads allowed is " + MAX_ON_FLY_SORTS);
this.SORT_ON_FLY = SORT_ON_FLY;
public void setSortOnFly(final int maxReadsToSort) {
logger.info("Sorting read file on the fly: max reads allowed is " + maxReadsToSort);
SORT_ON_FLY = true;
maxOnFlySorts = maxReadsToSort;
}
public void setSortOnFly() { setSortOnFly(100000); }
public void setDownsampleByFraction(final double fraction) {
logger.info("Downsampling to approximately " + (fraction * 100.0) + "% of filtered reads");
DOWNSAMPLE_BY_FRACTION = true;
downsamplingFraction = fraction;
}
// --------------------------------------------------------------------------------------------------------------
@ -337,8 +347,13 @@ public abstract class TraversalEngine {
protected Iterator<SAMRecord> WrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
Iterator<SAMRecord> wrappedIterator = rawIterator;
// NOTE: this (and other filtering) should be done before on-the-fly sorting
// as there is no reason to sort something that we will end of throwing away
if (DOWNSAMPLE_BY_FRACTION)
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
if (SORT_ON_FLY)
wrappedIterator = new SortSamIterator(wrappedIterator, MAX_ON_FLY_SORTS);
wrappedIterator = new SortSamIterator(wrappedIterator, maxOnFlySorts);
if (beSafeP && enableVerification)
wrappedIterator = new VerifyingSamIterator(wrappedIterator);