Optimization to TraverseReadsNano

-- Don't just read all inputs into a list, and then provide an iterator to that list, actually make a real iterator so NanoScheduler input thread can contribute meaningfully to the work load
-- Use NanoScheduler progress function, instead of home-grown updater
This commit is contained in:
Mark DePristo 2012-12-17 16:40:04 -05:00
parent b33f804cdc
commit 16eb1c5436
1 changed files with 37 additions and 27 deletions

View File

@ -33,14 +33,13 @@ import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider
import org.broadinstitute.sting.gatk.datasources.providers.ReadView;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction;
import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.LinkedList;
import java.util.List;
import java.util.Iterator;
/**
* A nano-scheduling version of TraverseReads.
@ -60,6 +59,13 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
public TraverseReadsNano(int nThreads) {
nanoScheduler = new NanoScheduler<MapData, MapResult, T>(nThreads);
nanoScheduler.setProgressFunction(new NSProgressFunction<MapData>() {
@Override
public void progress(MapData lastProcessedMap) {
if ( lastProcessedMap.refContext != null )
printProgress(lastProcessedMap.refContext.getLocus());
}
});
}
@Override
@ -88,14 +94,10 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
final TraverseReadsMap myMap = new TraverseReadsMap(walker);
final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker);
final List<MapData> aggregatedInputs = aggregateMapData(dataProvider);
final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce);
final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read;
final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead);
final Iterator<MapData> aggregatedInputs = aggregateMapData(dataProvider);
final T result = nanoScheduler.execute(aggregatedInputs, myMap, sum, myReduce);
updateCumulativeMetrics(dataProvider.getShard());
printProgress(locus);
return result;
}
@ -108,29 +110,37 @@ public class TraverseReadsNano<M,T> extends TraversalEngine<M,T,ReadWalker<M,T>,
* @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce
* should execute
*/
private List<MapData> aggregateMapData(final ReadShardDataProvider dataProvider) {
final ReadView reads = new ReadView(dataProvider);
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
private Iterator<MapData> aggregateMapData(final ReadShardDataProvider dataProvider) {
return new Iterator<MapData>() {
final ReadView reads = new ReadView(dataProvider);
final ReadReferenceView reference = new ReadReferenceView(dataProvider);
final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider);
final Iterator<SAMRecord> readIterator = reads.iterator();
final List<MapData> mapData = new LinkedList<MapData>();
for ( final SAMRecord read : reads ) {
final ReferenceContext refContext = ! read.getReadUnmappedFlag()
? reference.getReferenceContext(read)
: null;
@Override public boolean hasNext() { return readIterator.hasNext(); }
// if the read is mapped, create a metadata tracker
final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0
? rodView.getReferenceOrderedDataForRead(read)
: null;
@Override
public MapData next() {
final SAMRecord read = readIterator.next();
final ReferenceContext refContext = ! read.getReadUnmappedFlag()
? reference.getReferenceContext(read)
: null;
// update the number of reads we've seen
dataProvider.getShard().getReadMetrics().incrementNumIterations();
// if the read is mapped, create a metadata tracker
final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0
? rodView.getReferenceOrderedDataForRead(read)
: null;
mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker));
}
// update the number of reads we've seen
dataProvider.getShard().getReadMetrics().incrementNumIterations();
return mapData;
return new MapData((GATKSAMRecord)read, refContext, tracker);
}
@Override public void remove() {
throw new UnsupportedOperationException("Remove not supported");
}
};
}
@Override