Added iterator to randomly downsample to a given fraction of the reads.
Also, updated sort iterator to allow user to input max sorts. Put in placeholder for downsampling to given coverage. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@243 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
385736469c
commit
3af4290a49
|
|
@ -44,7 +44,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
public String HAPMAP_FILE = null;
|
||||
public Boolean ENABLED_THREADED_IO = false;
|
||||
public Boolean UNSAFE = false;
|
||||
public Boolean ENABLED_SORT_ON_FLY = false;
|
||||
public String MAX_ON_FLY_SORTS = null;
|
||||
public String DOWNSAMPLE_FRACTION = null;
|
||||
public String DOWNSAMPLE_COVERAGE = null;
|
||||
public String INTERVALS_FILE = null;
|
||||
|
||||
// our walker manager
|
||||
|
|
@ -104,7 +106,9 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
m_parser.addOptionalArg("Hapmap", "H", "Hapmap file", "HAPMAP_FILE");
|
||||
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
|
||||
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
|
||||
m_parser.addOptionalFlag("sort_on_the_fly", "F", "If set, enables on fly sorting of reads file.", "ENABLED_SORT_ON_FLY");
|
||||
m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS");
|
||||
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction of reads to downsample to", "DOWNSAMPLE_FRACTION");
|
||||
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage to downsample to", "DOWNSAMPLE_COVERAGE");
|
||||
m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := <contig> <start> <end>", "INTERVALS_FILE");
|
||||
m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI");
|
||||
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
|
||||
|
|
@ -228,8 +232,19 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
engine.setLocationFromFile(INTERVALS_FILE);
|
||||
}
|
||||
|
||||
if (MAX_ON_FLY_SORTS != null) {
|
||||
engine.setSortOnFly(Integer.parseInt(MAX_ON_FLY_SORTS));
|
||||
}
|
||||
|
||||
if (DOWNSAMPLE_FRACTION != null) {
|
||||
engine.setDownsampleByFraction(Double.parseDouble(DOWNSAMPLE_FRACTION));
|
||||
}
|
||||
|
||||
if (DOWNSAMPLE_COVERAGE != null) {
|
||||
; // TO DO: (Double.parseDouble(DOWNSAMPLE_COVERAGE));
|
||||
}
|
||||
|
||||
engine.setSafetyChecking(!UNSAFE);
|
||||
engine.setSortOnFly(ENABLED_SORT_ON_FLY);
|
||||
engine.setThreadedIO(ENABLED_THREADED_IO);
|
||||
engine.setWalkOverAllSites(WALK_ALL_LOCI);
|
||||
engine.initialize();
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.Random;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
|
||||
public class DownsampleIterator implements Iterator<SAMRecord> {
|
||||
|
||||
Iterator<SAMRecord> it;
|
||||
Random generator;
|
||||
int cutoff;
|
||||
SAMRecord next;
|
||||
|
||||
public DownsampleIterator(Iterator<SAMRecord> it, double fraction) {
|
||||
this.it = it;
|
||||
generator = new Random();
|
||||
cutoff = (int)(fraction * 10000);
|
||||
next = getNextRecord();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return next != null;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
SAMRecord result = next;
|
||||
next = getNextRecord();
|
||||
return result;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
private SAMRecord getNextRecord() {
|
||||
while ( true ) {
|
||||
if ( !it.hasNext() )
|
||||
return null;
|
||||
SAMRecord rec = it.next();
|
||||
if ( generator.nextInt(10000) < cutoff )
|
||||
return rec;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,10 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
|
|
|
|||
|
|
@ -79,9 +79,11 @@ public abstract class TraversalEngine {
|
|||
protected boolean DEBUGGING = false;
|
||||
protected boolean beSafeP = true;
|
||||
protected boolean SORT_ON_FLY = false;
|
||||
protected boolean DOWNSAMPLE_BY_FRACTION = false;
|
||||
protected boolean FILTER_UNSORTED_READS = false;
|
||||
protected boolean walkOverAllSites = false;
|
||||
protected int MAX_ON_FLY_SORTS = 100000;
|
||||
protected int maxOnFlySorts = 100000;
|
||||
protected double downsamplingFraction = 1.0;
|
||||
protected long N_RECORDS_TO_PRINT = 100000;
|
||||
protected boolean THREADED_IO = false;
|
||||
protected int THREADED_IO_BUFFER_SIZE = 10000;
|
||||
|
|
@ -153,10 +155,18 @@ public abstract class TraversalEngine {
|
|||
this.FILTER_UNSORTED_READS = filterUnsorted;
|
||||
}
|
||||
|
||||
public void setSortOnFly(final boolean SORT_ON_FLY) {
|
||||
if (SORT_ON_FLY)
|
||||
logger.info("Sorting read file on the fly: max reads allowed is " + MAX_ON_FLY_SORTS);
|
||||
this.SORT_ON_FLY = SORT_ON_FLY;
|
||||
public void setSortOnFly(final int maxReadsToSort) {
|
||||
logger.info("Sorting read file on the fly: max reads allowed is " + maxReadsToSort);
|
||||
SORT_ON_FLY = true;
|
||||
maxOnFlySorts = maxReadsToSort;
|
||||
}
|
||||
|
||||
public void setSortOnFly() { setSortOnFly(100000); }
|
||||
|
||||
public void setDownsampleByFraction(final double fraction) {
|
||||
logger.info("Downsampling to approximately " + (fraction * 100.0) + "% of filtered reads");
|
||||
DOWNSAMPLE_BY_FRACTION = true;
|
||||
downsamplingFraction = fraction;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -337,8 +347,13 @@ public abstract class TraversalEngine {
|
|||
protected Iterator<SAMRecord> WrapReadsIterator( final Iterator<SAMRecord> rawIterator, final boolean enableVerification ) {
|
||||
Iterator<SAMRecord> wrappedIterator = rawIterator;
|
||||
|
||||
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
||||
// as there is no reason to sort something that we will end of throwing away
|
||||
if (DOWNSAMPLE_BY_FRACTION)
|
||||
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
if (SORT_ON_FLY)
|
||||
wrappedIterator = new SortSamIterator(wrappedIterator, MAX_ON_FLY_SORTS);
|
||||
wrappedIterator = new SortSamIterator(wrappedIterator, maxOnFlySorts);
|
||||
|
||||
if (beSafeP && enableVerification)
|
||||
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
|
||||
|
|
|
|||
Loading…
Reference in New Issue