Added ability to downsample to a particular coverage
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@250 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bb3dbb5756
commit
6cc2fa24d5
|
|
@ -107,8 +107,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
|
||||
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
|
||||
m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS");
|
||||
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction of reads to downsample to", "DOWNSAMPLE_FRACTION");
|
||||
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage to downsample to", "DOWNSAMPLE_COVERAGE");
|
||||
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction [0.0-1.0] of reads to downsample to", "DOWNSAMPLE_FRACTION");
|
||||
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage [integer] to downsample to", "DOWNSAMPLE_COVERAGE");
|
||||
m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := <contig> <start> <end>", "INTERVALS_FILE");
|
||||
m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI");
|
||||
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
|
||||
|
|
@ -241,7 +241,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
}
|
||||
|
||||
if (DOWNSAMPLE_COVERAGE != null) {
|
||||
; // TO DO: (Double.parseDouble(DOWNSAMPLE_COVERAGE));
|
||||
engine.setDownsampleByCoverage(Integer.parseInt(DOWNSAMPLE_COVERAGE));
|
||||
}
|
||||
|
||||
engine.setSafetyChecking(!UNSAFE);
|
||||
|
|
|
|||
|
|
@ -3,7 +3,10 @@ package org.broadinstitute.sting.gatk;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
import java.lang.ref.Reference;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.Random;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
|
|
@ -98,4 +101,37 @@ public class LocusContext {
|
|||
public void setReferenceContig(final ReferenceSequence contig) {
|
||||
refContig = contig;
|
||||
}
|
||||
|
||||
public void downsampleToCoverage(int coverage) {
|
||||
if ( numReads() <= coverage )
|
||||
return;
|
||||
|
||||
// randomly choose numbers corresponding to positions in the reads list
|
||||
Random generator = new Random();
|
||||
TreeSet positions = new TreeSet();
|
||||
int i = 0;
|
||||
while ( i < coverage ) {
|
||||
if (positions.add(new Integer(generator.nextInt(reads.size()))))
|
||||
i++;
|
||||
}
|
||||
|
||||
ArrayList downsampledReads = new ArrayList();
|
||||
Iterator positionIter = positions.iterator();
|
||||
Iterator readsIter = reads.iterator();
|
||||
int currentRead = 0;
|
||||
while ( positionIter.hasNext() ) {
|
||||
int nextReadToKeep = (Integer)positionIter.next();
|
||||
|
||||
// fast-forward to the right read
|
||||
while ( currentRead < nextReadToKeep ) {
|
||||
readsIter.next();
|
||||
currentRead++;
|
||||
}
|
||||
|
||||
downsampledReads.add(readsIter.next());
|
||||
currentRead++;
|
||||
}
|
||||
|
||||
reads = downsampledReads;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -80,10 +80,12 @@ public abstract class TraversalEngine {
|
|||
protected boolean beSafeP = true;
|
||||
protected boolean SORT_ON_FLY = false;
|
||||
protected boolean DOWNSAMPLE_BY_FRACTION = false;
|
||||
protected boolean DOWNSAMPLE_BY_COVERAGE = false;
|
||||
protected boolean FILTER_UNSORTED_READS = false;
|
||||
protected boolean walkOverAllSites = false;
|
||||
protected int maxOnFlySorts = 100000;
|
||||
protected double downsamplingFraction = 1.0;
|
||||
protected int downsamplingCoverage = 0;
|
||||
protected long N_RECORDS_TO_PRINT = 100000;
|
||||
protected boolean THREADED_IO = false;
|
||||
protected int THREADED_IO_BUFFER_SIZE = 10000;
|
||||
|
|
@ -169,6 +171,12 @@ public abstract class TraversalEngine {
|
|||
downsamplingFraction = fraction;
|
||||
}
|
||||
|
||||
public void setDownsampleByCoverage(final int coverage) {
|
||||
logger.info("Downsampling to coverage " + coverage);
|
||||
DOWNSAMPLE_BY_COVERAGE = true;
|
||||
downsamplingCoverage = coverage;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// functions for dealing locations (areas of the genome we're traversing over)
|
||||
|
|
|
|||
|
|
@ -110,6 +110,8 @@ public class TraverseByLoci extends TraversalEngine {
|
|||
|
||||
// actually get the read and hand it to the walker
|
||||
LocusContext locus = iter.next();
|
||||
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||
locus.downsampleToCoverage(downsamplingCoverage);
|
||||
|
||||
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
|
||||
// location
|
||||
|
|
|
|||
|
|
@ -77,6 +77,8 @@ public class TraverseByLociByReference extends TraverseByLoci {
|
|||
locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads
|
||||
|
||||
locus.setReferenceContig(refSite.getCurrentContig());
|
||||
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||
locus.downsampleToCoverage(downsamplingCoverage);
|
||||
sum = walkAtLocus( walker, sum, locus, refSite, rodData );
|
||||
|
||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue