Added ability to downsample to a particular coverage

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@250 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-04-01 20:27:06 +00:00
parent bb3dbb5756
commit 6cc2fa24d5
5 changed files with 52 additions and 4 deletions

View File

@ -107,8 +107,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS");
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction of reads to downsample to", "DOWNSAMPLE_FRACTION");
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage to downsample to", "DOWNSAMPLE_COVERAGE");
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction [0.0-1.0] of reads to downsample to", "DOWNSAMPLE_FRACTION");
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage [integer] to downsample to", "DOWNSAMPLE_COVERAGE");
m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := <contig> <start> <end>", "INTERVALS_FILE");
m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI");
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
@ -241,7 +241,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
}
if (DOWNSAMPLE_COVERAGE != null) {
; // TO DO: (Double.parseDouble(DOWNSAMPLE_COVERAGE));
engine.setDownsampleByCoverage(Integer.parseInt(DOWNSAMPLE_COVERAGE));
}
engine.setSafetyChecking(!UNSAFE);

View File

@ -3,7 +3,10 @@ package org.broadinstitute.sting.gatk;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.lang.ref.Reference;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.TreeSet;
import java.util.Random;
import org.broadinstitute.sting.utils.GenomeLoc;
import edu.mit.broad.picard.reference.ReferenceSequence;
@ -98,4 +101,37 @@ public class LocusContext {
public void setReferenceContig(final ReferenceSequence contig) {
refContig = contig;
}
public void downsampleToCoverage(int coverage) {
if ( numReads() <= coverage )
return;
// randomly choose numbers corresponding to positions in the reads list
Random generator = new Random();
TreeSet positions = new TreeSet();
int i = 0;
while ( i < coverage ) {
if (positions.add(new Integer(generator.nextInt(reads.size()))))
i++;
}
ArrayList downsampledReads = new ArrayList();
Iterator positionIter = positions.iterator();
Iterator readsIter = reads.iterator();
int currentRead = 0;
while ( positionIter.hasNext() ) {
int nextReadToKeep = (Integer)positionIter.next();
// fast-forward to the right read
while ( currentRead < nextReadToKeep ) {
readsIter.next();
currentRead++;
}
downsampledReads.add(readsIter.next());
currentRead++;
}
reads = downsampledReads;
}
}

View File

@ -80,10 +80,12 @@ public abstract class TraversalEngine {
protected boolean beSafeP = true;
protected boolean SORT_ON_FLY = false;
protected boolean DOWNSAMPLE_BY_FRACTION = false;
protected boolean DOWNSAMPLE_BY_COVERAGE = false;
protected boolean FILTER_UNSORTED_READS = false;
protected boolean walkOverAllSites = false;
protected int maxOnFlySorts = 100000;
protected double downsamplingFraction = 1.0;
protected int downsamplingCoverage = 0;
protected long N_RECORDS_TO_PRINT = 100000;
protected boolean THREADED_IO = false;
protected int THREADED_IO_BUFFER_SIZE = 10000;
@ -169,6 +171,12 @@ public abstract class TraversalEngine {
downsamplingFraction = fraction;
}
public void setDownsampleByCoverage(final int coverage) {
logger.info("Downsampling to coverage " + coverage);
DOWNSAMPLE_BY_COVERAGE = true;
downsamplingCoverage = coverage;
}
// --------------------------------------------------------------------------------------------------------------
//
// functions for dealing locations (areas of the genome we're traversing over)

View File

@ -110,6 +110,8 @@ public class TraverseByLoci extends TraversalEngine {
// actually get the read and hand it to the walker
LocusContext locus = iter.next();
if ( DOWNSAMPLE_BY_COVERAGE )
locus.downsampleToCoverage(downsamplingCoverage);
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
// location

View File

@ -77,6 +77,8 @@ public class TraverseByLociByReference extends TraverseByLoci {
locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads
locus.setReferenceContig(refSite.getCurrentContig());
if ( DOWNSAMPLE_BY_COVERAGE )
locus.downsampleToCoverage(downsamplingCoverage);
sum = walkAtLocus( walker, sum, locus, refSite, rodData );
if (this.maxReads > 0 && this.nRecords > this.maxReads) {