Experimental versions of downsampler and Ryan's deduper are now available either

as walker attributes or from the command-line.  Not ready yet!  Downsampling/deduping 
works in a general sense, but this approach has not been completely optimized or validated.
Use with caution.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3392 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2010-05-19 05:40:05 +00:00
parent 46ba88018d
commit 017ab6b690
14 changed files with 194 additions and 106 deletions

View File

@ -0,0 +1,15 @@
package org.broadinstitute.sting.gatk;
/**
* Type of downsampling method to invoke.
*
* @author hanna
* @version 0.1
*/
public enum DownsampleType {
NONE,
ALL_READS,
EXPERIMENTAL_BY_SAMPLE,
EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR
}

View File

@ -0,0 +1,47 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.utils.StingException;
/**
* Describes the method for downsampling reads at a given locus.
*
* @author hanna
* @version 0.1
*/
public class DownsamplingMethod {
/**
* Type of downsampling to perform.
*/
public final DownsampleType type;
/**
* Actual downsampling target is specified as an integer number of reads.
*/
public final Integer toCoverage;
/**
* Actual downsampling target is specified as a fraction of total available reads.
*/
public final Double toFraction;
public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
// Do some basic sanity checks on the downsampling parameters passed in.
// Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator.
if(type != DownsampleType.NONE && type != DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR && toFraction == null && toCoverage == null)
throw new StingException("Must specify either toFraction or toCoverage when downsampling.");
// Fraction and coverage cannot both be specified.
if(toFraction != null && toCoverage != null)
throw new StingException("Downsampling coverage and fraction are both specified. Please choose only one.");
// Experimental by sample downsampling does not work with a fraction of reads.
if(type == DownsampleType.EXPERIMENTAL_BY_SAMPLE && toFraction != null)
throw new StingException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");
this.type = type;
this.toCoverage = toCoverage;
this.toFraction = toFraction;
}
}

View File

@ -524,10 +524,18 @@ public class GenomeAnalysisEngine {
* @return The reads object providing reads source info. * @return The reads object providing reads source info.
*/ */
private Reads extractSourceInfo(Walker walker, Collection<SamRecordFilter> filters, GATKArgumentCollection argCollection) { private Reads extractSourceInfo(Walker walker, Collection<SamRecordFilter> filters, GATKArgumentCollection argCollection) {
DownsamplingMethod method = null;
if(argCollection.downsamplingType != DownsampleType.NONE)
method = new DownsamplingMethod(argCollection.downsamplingType,argCollection.downsampleCoverage,argCollection.downsampleFraction);
else if(WalkerManager.getDownsamplingMethod(walker) != null)
method = WalkerManager.getDownsamplingMethod(walker);
else
method = new DownsamplingMethod(DownsampleType.NONE,null,null);
return new Reads(argCollection.samFiles, return new Reads(argCollection.samFiles,
argCollection.strictnessLevel, argCollection.strictnessLevel,
argCollection.downsampleFraction, method,
argCollection.downsampleCoverage,
new ValidationExclusion(Arrays.asList(argCollection.unsafe)), new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
filters, filters,
argCollection.readMaxPileup, argCollection.readMaxPileup,

View File

@ -28,8 +28,7 @@ import java.util.Collection;
public class Reads { public class Reads {
private List<File> readsFiles = null; private List<File> readsFiles = null;
private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT; private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.STRICT;
private Double downsamplingFraction = null; private DownsamplingMethod downsamplingMethod = null;
private Integer downsampleToCoverage = null;
private ValidationExclusion exclusionList = null; private ValidationExclusion exclusionList = null;
private Collection<SamRecordFilter> supplementalFilters = null; private Collection<SamRecordFilter> supplementalFilters = null;
protected int maximumReadsAtLocus = Integer.MAX_VALUE; // this should always be set, so we'll default it MAX_INT protected int maximumReadsAtLocus = Integer.MAX_VALUE; // this should always be set, so we'll default it MAX_INT
@ -76,19 +75,11 @@ public class Reads {
} }
/** /**
* Get the fraction of reads to downsample. * Gets the method and parameters used when downsampling reads.
* @return Downsample fraction. * @return Downsample fraction.
*/ */
public Double getDownsamplingFraction() { public DownsamplingMethod getDownsamplingMethod() {
return downsamplingFraction; return downsamplingMethod;
}
/**
* Downsample each locus to the specified coverage.
* @return Coverage to which to downsample.
*/
public Integer getDownsampleToCoverage() {
return downsampleToCoverage;
} }
/** /**
@ -117,6 +108,7 @@ public class Reads {
*/ */
public Reads( List<File> readsFiles ) { public Reads( List<File> readsFiles ) {
this.readsFiles = readsFiles; this.readsFiles = readsFiles;
this.downsamplingMethod = new DownsamplingMethod(DownsampleType.NONE,null,null);
this.supplementalFilters = new ArrayList<SamRecordFilter>(); this.supplementalFilters = new ArrayList<SamRecordFilter>();
this.exclusionList = new ValidationExclusion(); this.exclusionList = new ValidationExclusion();
} }
@ -127,8 +119,6 @@ public class Reads {
* is package protected. * is package protected.
* @param samFiles list of reads files. * @param samFiles list of reads files.
* @param strictness Stringency of reads file parsing. * @param strictness Stringency of reads file parsing.
* @param downsampleFraction fraction of reads to downsample.
* @param downsampleCoverage downsampling per-locus.
* @param exclusionList what safety checks we're willing to let slide * @param exclusionList what safety checks we're willing to let slide
* @param supplementalFilters additional filters to dynamically apply. * @param supplementalFilters additional filters to dynamically apply.
* @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with * @param generateExtendedEvents if true, the engine will issue an extra call to walker's map() with
@ -140,8 +130,7 @@ public class Reads {
*/ */
Reads( List<File> samFiles, Reads( List<File> samFiles,
SAMFileReader.ValidationStringency strictness, SAMFileReader.ValidationStringency strictness,
Double downsampleFraction, DownsamplingMethod downsamplingMethod,
Integer downsampleCoverage,
ValidationExclusion exclusionList, ValidationExclusion exclusionList,
Collection<SamRecordFilter> supplementalFilters, Collection<SamRecordFilter> supplementalFilters,
int maximumReadsAtLocus, int maximumReadsAtLocus,
@ -149,8 +138,7 @@ public class Reads {
boolean generateExtendedEvents) { boolean generateExtendedEvents) {
this.readsFiles = samFiles; this.readsFiles = samFiles;
this.validationStringency = strictness; this.validationStringency = strictness;
this.downsamplingFraction = downsampleFraction; this.downsamplingMethod = downsamplingMethod;
this.downsampleToCoverage = downsampleCoverage;
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
this.supplementalFilters = supplementalFilters; this.supplementalFilters = supplementalFilters;
this.maximumReadsAtLocus = maximumReadsAtLocus; this.maximumReadsAtLocus = maximumReadsAtLocus;

View File

@ -238,6 +238,27 @@ public class WalkerManager extends PluginManager<Walker> {
return filters; return filters;
} }
/**
* Gets the type of downsampling method requested by the walker. If an alternative
* downsampling method is specified on the command-line, the command-line version will
* be used instead.
* @param walker The walker to interrogate.
* @return The downsampling method, as specified by the walker. Null if none exists.
*/
public static DownsamplingMethod getDownsamplingMethod(Walker walker) {
DownsamplingMethod downsamplingMethod = null;
if( walker.getClass().isAnnotationPresent(Downsample.class) ) {
Downsample downsampleParameters = walker.getClass().getAnnotation(Downsample.class);
DownsampleType type = downsampleParameters.by();
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction);
}
return downsamplingMethod;
}
/** /**
* Create a name for this type of walker. * Create a name for this type of walker.
* *

View File

@ -29,6 +29,7 @@ import net.sf.samtools.SAMFileReader;
import org.broadinstitute.sting.utils.StingException; import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.simpleframework.xml.*; import org.simpleframework.xml.*;
import org.simpleframework.xml.core.Persister; import org.simpleframework.xml.core.Persister;
import org.simpleframework.xml.stream.Format; import org.simpleframework.xml.stream.Format;
@ -124,6 +125,10 @@ public class GATKArgumentCollection {
@Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level. Vastly improves performance at areas with abnormal depth due to mapping Q0 reads", required = false) @Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level. Vastly improves performance at areas with abnormal depth due to mapping Q0 reads", required = false)
public Boolean filterZeroMappingQualityReads = false; public Boolean filterZeroMappingQualityReads = false;
@Element(required = false)
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here.", required = false)
public DownsampleType downsamplingType = DownsampleType.NONE;
@Element(required = false) @Element(required = false)
@Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
public Double downsampleFraction = null; public Double downsampleFraction = null;
@ -148,13 +153,9 @@ public class GATKArgumentCollection {
@Argument(fullName = "max_reads_at_locus", shortName = "mrl", doc = "Sets the upper limit for the number of reads presented at a single locus; use this argument if you are running into memory issues resulting from too many reads piled up at a given locus (but use downsample_to_coverage instead if you are trying to downsample); int.MAX_VALUE by default.", required = false) @Argument(fullName = "max_reads_at_locus", shortName = "mrl", doc = "Sets the upper limit for the number of reads presented at a single locus; use this argument if you are running into memory issues resulting from too many reads piled up at a given locus (but use downsample_to_coverage instead if you are trying to downsample); int.MAX_VALUE by default.", required = false)
public int readMaxPileup = Integer.MAX_VALUE; public int readMaxPileup = Integer.MAX_VALUE;
@Element(required = false)
@Argument(fullName = "disablethreading", shortName = "dt", doc = "Disable experimental threading support.", required = false)
public Boolean disableThreading = false;
/** How many threads should be allocated to this analysis. */ /** How many threads should be allocated to this analysis. */
@Element(required = false) @Element(required = false)
@Argument(fullName = "numthreads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
public int numberOfThreads = 1; public int numberOfThreads = 1;
/** What rule should we use when merging intervals */ /** What rule should we use when merging intervals */

View File

@ -127,8 +127,8 @@ public abstract class LocusView extends LocusIterator implements View {
// Find the next. // Find the next.
seedNextLocus(); seedNextLocus();
if( sourceInfo.getDownsampleToCoverage() != null ) if( sourceInfo.getDownsamplingMethod().toCoverage != null )
current.downsampleToCoverage( sourceInfo.getDownsampleToCoverage() ); current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
// if the current loci isn't null, get the overflow tracker and pass it to the alignment context // if the current loci isn't null, get the overflow tracker and pass it to the alignment context
if ((this.loci != null)) if ((this.loci != null))

View File

@ -261,7 +261,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
return applyDecoratingIterators(enableVerification, return applyDecoratingIterators(enableVerification,
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(reads,mergingIterator)), new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(reads,mergingIterator)),
reads.getDownsamplingFraction(), reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters()); reads.getSupplementalFilters());
} }
@ -282,7 +282,7 @@ public class BlockDrivenSAMDataSource extends SAMDataSource {
return applyDecoratingIterators(shard instanceof ReadShard, return applyDecoratingIterators(shard instanceof ReadShard,
new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(reads,mergingIterator)), new ReleasingIterator(readers,StingSAMIteratorAdapter.adapt(reads,mergingIterator)),
reads.getDownsamplingFraction(), reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters()); reads.getSupplementalFilters());
} }

View File

@ -165,14 +165,14 @@ public class IndexDrivenSAMDataSource extends SAMDataSource {
iterator = seekRead(shard); iterator = seekRead(shard);
iterator = applyDecoratingIterators(true, iterator = applyDecoratingIterators(true,
iterator, iterator,
reads.getDownsamplingFraction(), reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters()); reads.getSupplementalFilters());
} else if (shard.getShardType() == Shard.ShardType.LOCUS) { } else if (shard.getShardType() == Shard.ShardType.LOCUS) {
iterator = seekLocus(shard); iterator = seekLocus(shard);
iterator = applyDecoratingIterators(false, iterator = applyDecoratingIterators(false,
iterator, iterator,
reads.getDownsamplingFraction(), reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters()); reads.getSupplementalFilters());
} else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) || } else if ((shard.getShardType() == Shard.ShardType.LOCUS_INTERVAL) ||
@ -180,7 +180,7 @@ public class IndexDrivenSAMDataSource extends SAMDataSource {
iterator = seekLocus(shard); iterator = seekLocus(shard);
iterator = applyDecoratingIterators(false, iterator = applyDecoratingIterators(false,
iterator, iterator,
reads.getDownsamplingFraction(), reads.getDownsamplingMethod().toFraction,
reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), reads.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
reads.getSupplementalFilters()); reads.getSupplementalFilters());

View File

@ -1,11 +1,9 @@
package org.broadinstitute.sting.gatk.executive; package org.broadinstitute.sting.gatk.executive;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.*;
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
import org.broadinstitute.sting.gatk.iterators.LocusOverflowTracker;
import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.traversals.TraversalStatistics; import org.broadinstitute.sting.gatk.traversals.TraversalStatistics;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -57,15 +55,21 @@ public class WindowMaker implements Iterable<WindowMaker.WindowMakerIterator>, I
/** /**
* Create a new window maker with the given iterator as a data source, covering * Create a new window maker with the given iterator as a data source, covering
* the given inteervals. * the given intervals.
* @param iterator The data source for this window. * @param iterator The data source for this window.
* @param intervals The set of intervals over which to traverse. * @param intervals The set of intervals over which to traverse.
*/ */
public WindowMaker(StingSAMIterator iterator, List<GenomeLoc> intervals) { public WindowMaker(StingSAMIterator iterator, List<GenomeLoc> intervals) {
this.sourceInfo = iterator.getSourceInfo(); this.sourceInfo = iterator.getSourceInfo();
this.readIterator = iterator; this.readIterator = iterator;
LocusIterator locusIterator = new LocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo); LocusIterator locusIterator;
if(sourceInfo.getDownsamplingMethod() != null &&
(sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_BY_SAMPLE || sourceInfo.getDownsamplingMethod().type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR))
locusIterator = new DownsamplingLocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo);
else
locusIterator = new LocusIteratorByState(new FilteringIterator(iterator,new LocusStreamFilterFunc()),sourceInfo);
this.locusOverflowTracker = locusIterator.getLocusOverflowTracker(); this.locusOverflowTracker = locusIterator.getLocusOverflowTracker();
this.sourceIterator = new PeekableIterator<AlignmentContext>(locusIterator); this.sourceIterator = new PeekableIterator<AlignmentContext>(locusIterator);

View File

@ -30,6 +30,8 @@ import net.sf.picard.util.PeekableIterator;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.Reads; import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.DownsamplingMethod;
import org.broadinstitute.sting.gatk.DownsampleType;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
@ -259,7 +261,7 @@ public class DownsamplingLocusIteratorByState extends LocusIterator {
// TODO: Push in header via constructor // TODO: Push in header via constructor
if(GenomeAnalysisEngine.instance.getDataSource() != null) if(GenomeAnalysisEngine.instance.getDataSource() != null)
sampleNames.addAll(SampleUtils.getSAMFileSamples(GenomeAnalysisEngine.instance.getSAMFileHeader())); sampleNames.addAll(SampleUtils.getSAMFileSamples(GenomeAnalysisEngine.instance.getSAMFileHeader()));
readStates = new ReadStateManager(samIterator,sampleNames,readInformation.getMaxReadsAtLocus()); readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod(),sampleNames);
this.readInfo = readInformation; this.readInfo = readInformation;
} }
@ -507,8 +509,10 @@ public class DownsamplingLocusIteratorByState extends LocusIterator {
private class ReadStateManager implements Iterable<SAMRecordState> { private class ReadStateManager implements Iterable<SAMRecordState> {
private final PeekableIterator<SAMRecord> iterator; private final PeekableIterator<SAMRecord> iterator;
private final DownsamplingMethod downsamplingMethod;
private final Map<String,ReservoirDownsampler<SAMRecord>> downsamplersBySampleName = new HashMap<String,ReservoirDownsampler<SAMRecord>>(); private final Map<String,ReservoirDownsampler<SAMRecord>> downsamplersBySampleName = new HashMap<String,ReservoirDownsampler<SAMRecord>>();
private final int maxReadsPerSample; private final int targetCoverage;
private final Deque<Map<String,List<SAMRecordState>>> readStatesByAlignmentStart; private final Deque<Map<String,List<SAMRecordState>>> readStatesByAlignmentStart;
@ -519,11 +523,16 @@ public class DownsamplingLocusIteratorByState extends LocusIterator {
*/ */
private Random downsampleRandomizer = new Random(38148309L); private Random downsampleRandomizer = new Random(38148309L);
public ReadStateManager(Iterator<SAMRecord> source, Collection<String> sampleNames, int maxReadsPerSample) { public ReadStateManager(Iterator<SAMRecord> source, DownsamplingMethod downsamplingMethod, Collection<String> sampleNames) {
this.iterator = new PeekableIterator<SAMRecord>(source); this.iterator = new PeekableIterator<SAMRecord>(source);
this.maxReadsPerSample = maxReadsPerSample; this.downsamplingMethod = downsamplingMethod;
for(String sampleName: sampleNames) this.targetCoverage = downsamplingMethod.toCoverage != null ? downsamplingMethod.toCoverage : 1;
downsamplersBySampleName.put(sampleName,new ReservoirDownsampler<SAMRecord>(maxReadsPerSample)); if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR)
downsamplersBySampleName.put(null,new ReservoirDownsampler<SAMRecord>(targetCoverage));
else {
for(String sampleName: sampleNames)
downsamplersBySampleName.put(sampleName,new ReservoirDownsampler<SAMRecord>(targetCoverage));
}
this.readStatesByAlignmentStart = new LinkedList<Map<String,List<SAMRecordState>>>(); this.readStatesByAlignmentStart = new LinkedList<Map<String,List<SAMRecordState>>>();
} }
@ -606,9 +615,23 @@ public class DownsamplingLocusIteratorByState extends LocusIterator {
} }
public void collectPendingReads() { public void collectPendingReads() {
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) { if(iterator.hasNext() && readStates.size() == 0) {
SAMRecord read = iterator.next(); int firstContigIndex = iterator.peek().getReferenceIndex();
downsamplersBySampleName.get(read.getReadGroup().getSample()).add(read); int firstAlignmentStart = iterator.peek().getAlignmentStart();
while(iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
SAMRecord read = iterator.next();
getDownsampler(read.getReadGroup().getSample()).add(read);
}
}
else {
// Fast fail in the case that the read is past the current position.
if(iterator.hasNext() && readIsPastCurrentPosition(iterator.peek()))
return;
while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
SAMRecord read = iterator.next();
getDownsampler(read.getReadGroup().getSample()).add(read);
}
} }
Map<String,List<SAMRecordState>> culledReadStatesBySample = new HashMap<String,List<SAMRecordState>>(); Map<String,List<SAMRecordState>> culledReadStatesBySample = new HashMap<String,List<SAMRecordState>>();
@ -621,14 +644,14 @@ public class DownsamplingLocusIteratorByState extends LocusIterator {
downsampler.clear(); downsampler.clear();
int readsInHanger = countReadsInHanger(sampleName); int readsInHanger = countReadsInHanger(sampleName);
if(readsInHanger+newReads.size() <= maxReadsPerSample) if(readsInHanger+newReads.size()<=targetCoverage || downsamplingMethod.type==DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR)
addReadsToHanger(culledReadStatesBySample,sampleName,newReads,newReads.size()); addReadsToHanger(culledReadStatesBySample,sampleName,newReads,newReads.size());
else { else {
Iterator<Map<String,List<SAMRecordState>>> backIterator = readStatesByAlignmentStart.descendingIterator(); Iterator<Map<String,List<SAMRecordState>>> backIterator = readStatesByAlignmentStart.descendingIterator();
boolean readPruned = true; boolean readPruned = true;
while(readsInHanger+newReads.size()>maxReadsPerSample && readPruned) { while(readsInHanger+newReads.size()>targetCoverage && readPruned) {
readPruned = false; readPruned = false;
while(readsInHanger+newReads.size()>maxReadsPerSample && backIterator.hasNext()) { while(readsInHanger+newReads.size()>targetCoverage && backIterator.hasNext()) {
List<SAMRecordState> readsAtLocus = backIterator.next().get(sampleName); List<SAMRecordState> readsAtLocus = backIterator.next().get(sampleName);
if(readsAtLocus.size() > 1) { if(readsAtLocus.size() > 1) {
readsAtLocus.remove(downsampleRandomizer.nextInt(readsAtLocus.size())); readsAtLocus.remove(downsampleRandomizer.nextInt(readsAtLocus.size()));
@ -638,65 +661,24 @@ public class DownsamplingLocusIteratorByState extends LocusIterator {
} }
} }
if(readsInHanger == maxReadsPerSample) { if(readsInHanger == targetCoverage) {
Collection<SAMRecordState> firstHangerForSample = readStatesByAlignmentStart.getFirst().get(sampleName); Collection<SAMRecordState> firstHangerForSample = readStatesByAlignmentStart.getFirst().get(sampleName);
readsInHanger -= firstHangerForSample.size(); readsInHanger -= firstHangerForSample.size();
firstHangerForSample.clear(); firstHangerForSample.clear();
} }
addReadsToHanger(culledReadStatesBySample,sampleName,newReads,maxReadsPerSample-readsInHanger); addReadsToHanger(culledReadStatesBySample,sampleName,newReads,targetCoverage-readsInHanger);
} }
readStatesByAlignmentStart.add(culledReadStatesBySample); readStatesByAlignmentStart.add(culledReadStatesBySample);
} }
}
/* else { private ReservoirDownsampler<SAMRecord> getDownsampler(String sampleName) {
if() { if(downsamplingMethod.type == DownsampleType.EXPERIMENTAL_NAIVE_DUPLICATE_ELIMINATOR)
// Consume the collection of reads. return downsamplersBySampleName.get(null);
downsamplingIterator.next(); else
return downsamplersBySampleName.get(sampleName);
Map<String,Collection<SAMRecord>> newReadsBySample = new HashMap<String,Collection<SAMRecord>>();
Map<String,List<SAMRecordState>> culledReadStatesBySample = new HashMap<String,List<SAMRecordState>>();
for(String sampleName: sampleNames)
newReadsBySample.put(sampleName,getReadsForGivenSample(reads,sampleName));
for(String sampleName: newReadsBySample.keySet()) {
Collection<SAMRecord> newReads = newReadsBySample.get(sampleName);
int readsInHanger = countReadsInHanger(sampleName);
//if(readsInHanger+newReads.size() <= maxReadsPerSample)
addReadsToHanger(culledReadStatesBySample,sampleName,newReads,newReads.size());
Iterator<Map<String,List<SAMRecordState>>> backIterator = readStatesByAlignmentStart.descendingIterator();
boolean readPruned = true;
while(readsInHanger+newReads.size()>maxReadsPerSample && readPruned) {
readPruned = false;
while(readsInHanger+newReads.size()>maxReadsPerSample && backIterator.hasNext()) {
List<SAMRecordState> readsAtLocus = backIterator.next().get(sampleName);
if(readsAtLocus.size() > 1) {
readsAtLocus.remove(downsampleRandomizer.nextInt(readsAtLocus.size()));
readPruned = true;
readsInHanger--;
}
}
}
if(readsInHanger == maxReadsPerSample) {
Collection<SAMRecordState> firstHangerForSample = readStatesByAlignmentStart.getFirst().get(sampleName);
readsInHanger -= firstHangerForSample.size();
firstHangerForSample.clear();
}
addReadsToHanger(culledReadStatesBySample,sampleName,newReads,maxReadsPerSample-readsInHanger);
}
}
readStatesByAlignmentStart.add(culledReadStatesBySample);
}
else if(readIsPastCurrentPosition(reads.iterator().next()))
break;
}
*/
} }
private int countReadsInHanger() { private int countReadsInHanger() {
@ -737,9 +719,10 @@ public class DownsamplingLocusIteratorByState extends LocusIterator {
Iterator<Map<String,List<SAMRecordState>>> hangerIterator = readStatesByAlignmentStart.iterator(); Iterator<Map<String,List<SAMRecordState>>> hangerIterator = readStatesByAlignmentStart.iterator();
while(hangerIterator.hasNext()) { while(hangerIterator.hasNext()) {
Map<String,List<SAMRecordState>> hangerEntry = hangerIterator.next(); Map<String,List<SAMRecordState>> hangerEntry = hangerIterator.next();
for(String sampleName: sampleNames) { Iterator<List<SAMRecordState>> entryBySampleIterator = hangerEntry.values().iterator();
if(hangerEntry.containsKey(sampleName) && hangerEntry.get(sampleName).size() == 0) while(entryBySampleIterator.hasNext()) {
hangerEntry.remove(sampleName); if(entryBySampleIterator.next().size() == 0)
entryBySampleIterator.remove();
} }
if(hangerEntry.size() == 0) if(hangerEntry.size() == 0)
hangerIterator.remove(); hangerIterator.remove();

View File

@ -0,0 +1,22 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.DownsampleType;
import java.lang.annotation.*;
/**
* Specifies a method for downsampling the reads passed to a given
* walker based on the input from that walker.
*
* @author hanna
* @version 0.1
*/
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface Downsample {
DownsampleType by();
int toCoverage() default -1;
double toFraction() default -1.0F;
}

View File

@ -73,7 +73,7 @@ public class ReservoirDownsampler<T> implements Collection<T> {
* @return The downsampled contents of this reservoir. * @return The downsampled contents of this reservoir.
*/ */
public Collection<T> getDownsampledContents() { public Collection<T> getDownsampledContents() {
return Collections.unmodifiableCollection(reservoir); return (Collection<T>)reservoir.clone();
} }
@Override @Override

View File

@ -92,7 +92,6 @@ public class GATKArgumentCollectionUnitTest extends BaseTest {
collect.intervals = new ArrayList<String>(); collect.intervals = new ArrayList<String>();
collect.intervals.add("intervals".toLowerCase()); collect.intervals.add("intervals".toLowerCase());
collect.excludeIntervals = new ArrayList<String>(); collect.excludeIntervals = new ArrayList<String>();
collect.disableThreading = false;
collect.outFileName = "outFileName".toLowerCase(); collect.outFileName = "outFileName".toLowerCase();
collect.errFileName = "errFileName".toLowerCase(); collect.errFileName = "errFileName".toLowerCase();
collect.outErrFileName = "outErrFileName".toLowerCase(); collect.outErrFileName = "outErrFileName".toLowerCase();