From 24c84da60de5c8769e23a83231252bd646d2d258 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 23 Dec 2011 15:39:12 -0500 Subject: [PATCH] 'Fixing' the changes in ReferenceDataSource so that a shard properly contains a list of GenomeLocs instead of a single merged one. However, that uncovered a probable bug in the engine, so instead of letting this code fester unfixed in the build (affecting everyone in the group) I've decided to revert to the previous (slow, but working) version and fix the engine in my own branch. --- .../reference/ReferenceDataSource.java | 65 +++++++++++++------ 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java index be33a5691..4ecfe472d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reference/ReferenceDataSource.java @@ -45,6 +45,7 @@ import org.broadinstitute.sting.utils.file.FileSystemInabilityToLockException; import java.io.File; import java.util.ArrayList; import java.util.Collections; +import java.util.LinkedList; import java.util.List; /** @@ -226,6 +227,28 @@ public class ReferenceDataSource { return shards; } + + public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int maxShardSize) { + List shards = new ArrayList(); + + for(GenomeLoc interval: intervals) { + while(interval.size() > maxShardSize) { + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart(),interval.getStart()+maxShardSize-1)), + null)); + interval = 
intervals.getGenomeLocParser().createGenomeLoc(interval.getContig(),interval.getStart()+maxShardSize,interval.getStop()); + } + shards.add(new LocusShard(intervals.getGenomeLocParser(), + readsDataSource, + Collections.singletonList(interval), + null)); + } + + return shards; + } + + /** * Creates an iterator for processing the entire reference. * @param readsDataSource the reads datasource to embed in the locus shard. TODO: decouple the creation of the shards themselves from the creation of the driving iterator so that datasources need not be passed to datasources. @@ -233,46 +256,50 @@ public class ReferenceDataSource { * @param targetShardSize the suggested - and maximum - shard size which can be used to create this list; we will merge intervals greedily so that we generate shards up to but not greater than the target size. * @return Creates a schedule for performing a traversal over the entire reference. */ +/* public Iterable createShardsOverIntervals(final SAMDataSource readsDataSource, final GenomeLocSortedSet intervals, final int targetShardSize) { final List shards = new ArrayList(); final GenomeLocParser parser = intervals.getGenomeLocParser(); - GenomeLoc currentInterval = null; + LinkedList currentIntervals = new LinkedList(); for(GenomeLoc interval: intervals) { // if the next interval is too big, we can safely shard currentInterval and then break down this one if (interval.size() > targetShardSize) { - if (currentInterval != null) - shards.add(createShardFromInterval(currentInterval, readsDataSource, parser)); - currentInterval = interval; - while(currentInterval.size() > targetShardSize) { - final GenomeLoc partialInterval = parser.createGenomeLoc(currentInterval.getContig(), currentInterval.getStart(), currentInterval.getStart()+targetShardSize-1); - shards.add(createShardFromInterval(partialInterval, readsDataSource, parser)); - currentInterval = 
parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart()+targetShardSize,currentInterval.getStop()); + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + while(interval.size() > targetShardSize) { + final GenomeLoc partialInterval = parser.createGenomeLoc(interval.getContig(), interval.getStart(), interval.getStart()+targetShardSize-1); + shards.add(createShardFromInterval(Collections.singletonList(partialInterval), readsDataSource, parser)); + interval = parser.createGenomeLoc(interval.getContig(), interval.getStart() + targetShardSize, interval.getStop()); } + currentIntervals = new LinkedList(); + currentIntervals.add(interval); } // otherwise, we need to check whether we can merge this interval with currentInterval (and either shard currentInterval or merge accordingly) else { - if (currentInterval == null) { - currentInterval = interval; + if (currentIntervals.isEmpty()) { + currentIntervals.add(interval); } - else if (currentInterval.compareContigs(interval) != 0 || interval.getStop() - currentInterval.getStart() + 1 > targetShardSize) { - shards.add(createShardFromInterval(currentInterval, readsDataSource, parser)); - currentInterval = interval; - } else { - currentInterval = parser.createGenomeLoc(currentInterval.getContig(),currentInterval.getStart(),interval.getStop()); + else { + if (currentIntervals.getLast().compareContigs(interval) != 0 || interval.getStop() - currentIntervals.getLast().getStart() + 1 > targetShardSize) { + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); + currentIntervals = new LinkedList(); + } + currentIntervals.add(interval); } } } - if (currentInterval != null) - shards.add(createShardFromInterval(currentInterval, readsDataSource, parser)); + if (!currentIntervals.isEmpty()) + shards.add(createShardFromInterval(currentIntervals, readsDataSource, parser)); return shards; } - private static Shard 
createShardFromInterval(final GenomeLoc interval, final SAMDataSource readsDataSource, final GenomeLocParser parser) { + private static Shard createShardFromInterval(final List intervals, final SAMDataSource readsDataSource, final GenomeLocParser parser) { //logger.debug("Adding shard " + interval); return new LocusShard(parser, readsDataSource, - Collections.singletonList(interval), + intervals, null); } +*/ }