Added ability to downsample to a particular coverage
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@250 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
bb3dbb5756
commit
6cc2fa24d5
|
|
@ -107,8 +107,8 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
|
m_parser.addOptionalFlag("threaded_IO", "P", "If set, enables threaded I/O operations", "ENABLED_THREADED_IO");
|
||||||
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
|
m_parser.addOptionalFlag("unsafe", "U", "If set, enables unsafe operations, nothing will be checked at runtime.", "UNSAFE");
|
||||||
m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS");
|
m_parser.addOptionalArg("sort_on_the_fly", "sort", "Maximum number of reads to sort on the fly", "MAX_ON_FLY_SORTS");
|
||||||
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction of reads to downsample to", "DOWNSAMPLE_FRACTION");
|
m_parser.addOptionalArg("downsample_to_fraction", "dfrac", "Fraction [0.0-1.0] of reads to downsample to", "DOWNSAMPLE_FRACTION");
|
||||||
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage to downsample to", "DOWNSAMPLE_COVERAGE");
|
m_parser.addOptionalArg("downsample_to_coverage", "dcov", "Coverage [integer] to downsample to", "DOWNSAMPLE_COVERAGE");
|
||||||
m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := <contig> <start> <end>", "INTERVALS_FILE");
|
m_parser.addOptionalArg("intervals_file", "V", "File containing list of genomic intervals to operate on. line := <contig> <start> <end>", "INTERVALS_FILE");
|
||||||
m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI");
|
m_parser.addOptionalArg("all_loci", "A", "Should we process all loci, not just those covered by reads", "WALK_ALL_LOCI");
|
||||||
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
|
m_parser.addOptionalArg("out", "o", "An output file presented to the walker. Will overwrite contents if file exists.", "outFileName" );
|
||||||
|
|
@ -241,7 +241,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (DOWNSAMPLE_COVERAGE != null) {
|
if (DOWNSAMPLE_COVERAGE != null) {
|
||||||
; // TO DO: (Double.parseDouble(DOWNSAMPLE_COVERAGE));
|
engine.setDownsampleByCoverage(Integer.parseInt(DOWNSAMPLE_COVERAGE));
|
||||||
}
|
}
|
||||||
|
|
||||||
engine.setSafetyChecking(!UNSAFE);
|
engine.setSafetyChecking(!UNSAFE);
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,10 @@ package org.broadinstitute.sting.gatk;
|
||||||
import net.sf.samtools.SAMRecord;
|
import net.sf.samtools.SAMRecord;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.lang.ref.Reference;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||||
|
|
@ -98,4 +101,37 @@ public class LocusContext {
|
||||||
public void setReferenceContig(final ReferenceSequence contig) {
|
public void setReferenceContig(final ReferenceSequence contig) {
|
||||||
refContig = contig;
|
refContig = contig;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void downsampleToCoverage(int coverage) {
|
||||||
|
if ( numReads() <= coverage )
|
||||||
|
return;
|
||||||
|
|
||||||
|
// randomly choose numbers corresponding to positions in the reads list
|
||||||
|
Random generator = new Random();
|
||||||
|
TreeSet positions = new TreeSet();
|
||||||
|
int i = 0;
|
||||||
|
while ( i < coverage ) {
|
||||||
|
if (positions.add(new Integer(generator.nextInt(reads.size()))))
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
ArrayList downsampledReads = new ArrayList();
|
||||||
|
Iterator positionIter = positions.iterator();
|
||||||
|
Iterator readsIter = reads.iterator();
|
||||||
|
int currentRead = 0;
|
||||||
|
while ( positionIter.hasNext() ) {
|
||||||
|
int nextReadToKeep = (Integer)positionIter.next();
|
||||||
|
|
||||||
|
// fast-forward to the right read
|
||||||
|
while ( currentRead < nextReadToKeep ) {
|
||||||
|
readsIter.next();
|
||||||
|
currentRead++;
|
||||||
|
}
|
||||||
|
|
||||||
|
downsampledReads.add(readsIter.next());
|
||||||
|
currentRead++;
|
||||||
|
}
|
||||||
|
|
||||||
|
reads = downsampledReads;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -80,10 +80,12 @@ public abstract class TraversalEngine {
|
||||||
protected boolean beSafeP = true;
|
protected boolean beSafeP = true;
|
||||||
protected boolean SORT_ON_FLY = false;
|
protected boolean SORT_ON_FLY = false;
|
||||||
protected boolean DOWNSAMPLE_BY_FRACTION = false;
|
protected boolean DOWNSAMPLE_BY_FRACTION = false;
|
||||||
|
protected boolean DOWNSAMPLE_BY_COVERAGE = false;
|
||||||
protected boolean FILTER_UNSORTED_READS = false;
|
protected boolean FILTER_UNSORTED_READS = false;
|
||||||
protected boolean walkOverAllSites = false;
|
protected boolean walkOverAllSites = false;
|
||||||
protected int maxOnFlySorts = 100000;
|
protected int maxOnFlySorts = 100000;
|
||||||
protected double downsamplingFraction = 1.0;
|
protected double downsamplingFraction = 1.0;
|
||||||
|
protected int downsamplingCoverage = 0;
|
||||||
protected long N_RECORDS_TO_PRINT = 100000;
|
protected long N_RECORDS_TO_PRINT = 100000;
|
||||||
protected boolean THREADED_IO = false;
|
protected boolean THREADED_IO = false;
|
||||||
protected int THREADED_IO_BUFFER_SIZE = 10000;
|
protected int THREADED_IO_BUFFER_SIZE = 10000;
|
||||||
|
|
@ -169,6 +171,12 @@ public abstract class TraversalEngine {
|
||||||
downsamplingFraction = fraction;
|
downsamplingFraction = fraction;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setDownsampleByCoverage(final int coverage) {
|
||||||
|
logger.info("Downsampling to coverage " + coverage);
|
||||||
|
DOWNSAMPLE_BY_COVERAGE = true;
|
||||||
|
downsamplingCoverage = coverage;
|
||||||
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// functions for dealing locations (areas of the genome we're traversing over)
|
// functions for dealing locations (areas of the genome we're traversing over)
|
||||||
|
|
|
||||||
|
|
@ -110,6 +110,8 @@ public class TraverseByLoci extends TraversalEngine {
|
||||||
|
|
||||||
// actually get the read and hand it to the walker
|
// actually get the read and hand it to the walker
|
||||||
LocusContext locus = iter.next();
|
LocusContext locus = iter.next();
|
||||||
|
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||||
|
locus.downsampleToCoverage(downsamplingCoverage);
|
||||||
|
|
||||||
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
|
// if we don't have a particular interval we're processing, check them all, otherwise only operate at this
|
||||||
// location
|
// location
|
||||||
|
|
|
||||||
|
|
@ -77,6 +77,8 @@ public class TraverseByLociByReference extends TraverseByLoci {
|
||||||
locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads
|
locus = new LocusContext(current, NO_READS, NO_OFFSETS); // make the empty locus that has no reads
|
||||||
|
|
||||||
locus.setReferenceContig(refSite.getCurrentContig());
|
locus.setReferenceContig(refSite.getCurrentContig());
|
||||||
|
if ( DOWNSAMPLE_BY_COVERAGE )
|
||||||
|
locus.downsampleToCoverage(downsamplingCoverage);
|
||||||
sum = walkAtLocus( walker, sum, locus, refSite, rodData );
|
sum = walkAtLocus( walker, sum, locus, refSite, rodData );
|
||||||
|
|
||||||
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
if (this.maxReads > 0 && this.nRecords > this.maxReads) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue