diff --git a/java/src/net/sf/samtools/BAMBlockIterator.java b/java/src/net/sf/samtools/BAMBlockIterator.java deleted file mode 100644 index b87bce8ee..000000000 --- a/java/src/net/sf/samtools/BAMBlockIterator.java +++ /dev/null @@ -1,95 +0,0 @@ -package net.sf.samtools; - -import net.sf.samtools.util.CloseableIterator; -import net.sf.picard.PicardException; - -import java.util.NoSuchElementException; -import java.io.IOException; -import java.io.FileInputStream; -import java.io.File; - -/** - * Walks over a BAM file, discovering and returning the starting location of each block - * in chunk format. - * - * @author mhanna - * @version 0.1 - */ -public class BAMBlockIterator implements CloseableIterator { - /** - * The input stream for back files. - */ - private FileInputStream inputStream; - - /** - * File channel from which to read chunks. - */ - private BlockReader blockReader; - - /** - * What is the current position of this block within the BAM file? - */ - private long position = 0; - - /** - * Iterate through the BAM chunks in a file. - * @param file stream File to use when accessing the BAM. - * @throws IOException in case of problems reading from the file. - */ - public BAMBlockIterator(File file) throws IOException { - inputStream = new FileInputStream(file); - blockReader = new BlockReader(inputStream); - } - - /** - * Close the existing file. - */ - public void close() { - try { - inputStream.close(); - } - catch(IOException ex) { - throw new PicardException("Unable to close file", ex); - } - } - - /** - * Are there other chunks to retrieve in this file? - * @return True if other chunks are available, false otherwise. - */ - public boolean hasNext() { - try { - return !blockReader.eof(position); - } catch(IOException ex) { - throw new SAMException("Unable to check file for a next BAM record", ex); - } - } - - /** - * Get the next chunk from the iterator. - * @return The next chunk. - * @throws NoSuchElementException if no next chunk is available. 
- */ - public Block next() { - if(!hasNext()) - throw new NoSuchElementException("No next chunk is available."); - - Block block = null; - try { - block = blockReader.getBlockAt(position); - position = block.position + block.compressedBlockSize; - } - catch(IOException ex) { - throw new SAMException("Unable to completely read chunk at end of file.", ex); - } - return block; - } - - /** - * Remove a chunk from the iterator. - * @throws UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("BAMChunkIterator: Cannot remove a BAM chunk."); - } -} diff --git a/java/src/net/sf/samtools/BAMChunkIterator.java b/java/src/net/sf/samtools/BAMChunkIterator.java deleted file mode 100644 index 903a067bd..000000000 --- a/java/src/net/sf/samtools/BAMChunkIterator.java +++ /dev/null @@ -1,88 +0,0 @@ -package net.sf.samtools; - -import java.util.*; - -/** - * Adapts a list of BAM blocks to BAM chunks. - * - * @author mhanna - * @version 0.1 - */ -public class BAMChunkIterator implements Iterator { - /** - * The wrapped chunk iterator. - */ - private final BAMBlockIterator blockIterator; - - /** - * List of prefetched segments. If prefetchedSegments is - * empty, this should mean that there's nothing left in the queue. - */ - private Queue prefetchedSegments; - - /** - * List of chunks to filter out of the final chunk list. - */ - private final Queue filters; - - /** - * Creates a new chunk iterator wrapping the given block, optionally filtering - * out a list of chunks from those blocks. - * TODO: This class is too smart. Wrap filtering functionality in another class. - * @param blockIterator block iterator to wrap. - * @param filters List of chunks to filter out of the given input stream. 
- */ - public BAMChunkIterator(BAMBlockIterator blockIterator, List filters) { - this.blockIterator = blockIterator; - this.prefetchedSegments = new LinkedList(); - this.filters = new PriorityQueue(filters); - seedNextSegments(); - } - - /** - * Is a next chunk available. - * @return true if a next chunk is available; false otherwise. - */ - public boolean hasNext() { - return !prefetchedSegments.isEmpty(); - } - - /** - * Returns the next chunk in the list. - * @return Next block, adapted to a chunk. - */ - public Chunk next() { - if(prefetchedSegments.isEmpty()) - throw new NoSuchElementException("BAMChunkIterator is exhausted."); - // TODO: Maybe tie together adjacent chunks together? - Chunk nextChunk = prefetchedSegments.remove().toChunk(); - seedNextSegments(); - return nextChunk; - } - - /** - * @throws UnsupportedOperationException always. - */ - public void remove() { - throw new UnsupportedOperationException("Cannot remove from a BAM chunk iterator"); - } - - /** - * Seed the next value or set of values for the iterator, if necessary. - */ - private void seedNextSegments() { - while(prefetchedSegments.isEmpty() && blockIterator.hasNext()) { - // Add the new segments to the prefetch... - prefetchedSegments.add(new BlockSegment(blockIterator.next())); - - // And iteratively subtract the filtering chunks. 
- for(Chunk filter: filters) { - Queue filteredBlockSegments = new LinkedList(); - for(BlockSegment blockSegment: prefetchedSegments) - filteredBlockSegments.addAll(blockSegment.minus(filter)); - prefetchedSegments = filteredBlockSegments; - } - } - } - -} diff --git a/java/src/net/sf/samtools/BAMFileHeaderLoader.java b/java/src/net/sf/samtools/BAMFileHeaderLoader.java deleted file mode 100644 index 0b39ddd33..000000000 --- a/java/src/net/sf/samtools/BAMFileHeaderLoader.java +++ /dev/null @@ -1,127 +0,0 @@ -package net.sf.samtools; - -import net.sf.samtools.util.BlockCompressedInputStream; -import net.sf.samtools.util.BinaryCodec; -import net.sf.samtools.util.StringLineReader; - -import java.io.File; -import java.io.IOException; -import java.io.DataInputStream; -import java.util.Arrays; -import java.util.List; -import java.util.ArrayList; - -/** - * Loads a BAM file header from an file, optionally providing its position - * within the file. - * - * @author mhanna - * @version 0.1 - */ -public class BAMFileHeaderLoader { - /** - * The contents of the BAM file header. - */ - private final SAMFileHeader header; - - /** - * Location of the header within the BAM file. - */ - private final Chunk location; - - public static final Chunk preambleLocation = new Chunk(0<<16 | 0, 0<<16 | 3); - - /** - * Load the header from the given file. - * @param header the parsed haeder for the BAM file. - * @param location the location of the header (start and stop) within the BAM. - */ - private BAMFileHeaderLoader(SAMFileHeader header, Chunk location) { - this.header = header; - this.location = location; - } - - /** - * Gets the header for the given BAM file. - * @return The header for this BAM file. - */ - public SAMFileHeader getHeader() { - return header; - } - - /** - * Gets the location of the header within the given BAM file, in chunk format. - * @return the location of the header, in chunk coordinates. 
- */ - public Chunk getLocation() { - return location; - } - - public static BAMFileHeaderLoader load(File file) throws IOException { - BlockCompressedInputStream inputStream = new BlockCompressedInputStream(file); - BinaryCodec binaryCodec = new BinaryCodec(new DataInputStream(inputStream)); - - final byte[] buffer = new byte[4]; - binaryCodec.readBytes(buffer); - if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { - throw new IOException("Invalid BAM file header"); - } - - final int headerTextLength = binaryCodec.readInt(); - final String textHeader = binaryCodec.readString(headerTextLength); - final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); - headerCodec.setValidationStringency(SAMFileReader.ValidationStringency.SILENT); - SAMFileHeader header = headerCodec.decode(new StringLineReader(textHeader),file.getAbsolutePath()); - - // directly copied from BAMFileReader... - final int sequenceCount = binaryCodec.readInt(); - if (header.getSequenceDictionary().size() > 0) { - // It is allowed to have binary sequences but no text sequences, so only validate if both are present - if (sequenceCount != header.getSequenceDictionary().size()) { - throw new SAMFormatException("Number of sequences in text header (" + - header.getSequenceDictionary().size() + - ") != number of sequences in binary header (" + sequenceCount + ") for file " + file); - } - for (int i = 0; i < sequenceCount; i++) { - final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(binaryCodec,file); - final SAMSequenceRecord sequenceRecord = header.getSequence(i); - if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { - throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + - binaryCodec); - } - if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { - throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + - 
binaryCodec); - } - } - } else { - // If only binary sequences are present, copy them into mFileHeader - final List sequences = new ArrayList(sequenceCount); - for (int i = 0; i < sequenceCount; i++) { - sequences.add(readSequenceRecord(binaryCodec,file)); - } - header.setSequenceDictionary(new SAMSequenceDictionary(sequences)); - } - inputStream.close(); - - return new BAMFileHeaderLoader(header,new Chunk(buffer.length,inputStream.getFilePointer()-1)); - } - - /** - * Reads a single binary sequence record from the file or stream - * @param binaryCodec stream to read from. - * @param file Note that this is used only for reporting errors. - * @return an individual sequence record. - */ - private static SAMSequenceRecord readSequenceRecord(final BinaryCodec binaryCodec, final File file) { - final int nameLength = binaryCodec.readInt(); - if (nameLength <= 1) { - throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + file.getAbsolutePath()); - } - final String sequenceName = binaryCodec.readString(nameLength - 1); - // Skip the null terminator - binaryCodec.readByte(); - final int sequenceLength = binaryCodec.readInt(); - return new SAMSequenceRecord(sequenceName, sequenceLength); - } -} diff --git a/java/src/net/sf/samtools/BlockTestHarness.java b/java/src/net/sf/samtools/BlockTestHarness.java deleted file mode 100644 index 52386b282..000000000 --- a/java/src/net/sf/samtools/BlockTestHarness.java +++ /dev/null @@ -1,56 +0,0 @@ -package net.sf.samtools; - -import java.io.File; -import java.io.IOException; -import java.util.Collections; - -/** - * Test harness for playing with sharding by BGZF block. 
- * - * @author mhanna - * @version 0.1 - */ -public class BlockTestHarness { - private static void usage() { - System.out.println("Usage: ChunkTestHarness .bam"); - System.exit(1); - } - - public static void main(String args[]) throws IOException { - if(args.length == 0) - usage(); - - String bamFileName = args[0]; - if(!bamFileName.endsWith(".bam")) - usage(); - - File bamFile = new File(bamFileName); - if(!bamFile.exists()) - usage(); - - Chunk headerLocation = BAMFileHeaderLoader.load(bamFile).getLocation(); - System.out.printf("Header location = %s%n", headerLocation); - - BAMChunkIterator chunkIterator = new BAMChunkIterator(new BAMBlockIterator(bamFile), - Collections.singletonList(headerLocation)); - while(chunkIterator.hasNext()) { - Chunk chunk = chunkIterator.next(); - } - - // The following test code blocks a bunch of files together. - /* - BAMBlockIterator blockIterator = new BAMBlockIterator(bamFile); - long blockCount = 0; - - long startTime = System.currentTimeMillis(); - while(blockIterator.hasNext()) { - Block block = blockIterator.next(); - blockCount++; - System.out.println(block); - } - long endTime = System.currentTimeMillis(); - - System.out.printf("Number of chunks: %d; elapsed time: %dms%n", blockCount, endTime-startTime); - */ - } -} diff --git a/java/src/net/sf/samtools/Chunk.java b/java/src/net/sf/samtools/Chunk.java index ee9ae4a5d..949ec8879 100644 --- a/java/src/net/sf/samtools/Chunk.java +++ b/java/src/net/sf/samtools/Chunk.java @@ -30,7 +30,7 @@ public class Chunk implements Cloneable,Comparable { return mChunkStart; } - void setChunkStart(final long value) { + public void setChunkStart(final long value) { mChunkStart = value; } @@ -38,7 +38,7 @@ public class Chunk implements Cloneable,Comparable { return mChunkEnd; } - void setChunkEnd(final long value) { + public void setChunkEnd(final long value) { mChunkEnd = value; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java 
b/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java index c3841359e..8e0b84338 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/BAMFileStat.java @@ -47,8 +47,7 @@ public class BAMFileStat extends CommandLineProgram { switch(command) { case ShowBlocks: - showBlocks(new File(bamFileName),startPosition,stopPosition); - break; + throw new StingException("The BAM block inspector has been disabled."); case ShowIndex: showIndexBins(new File(bamFileName+".bai")); break; @@ -70,26 +69,6 @@ public class BAMFileStat extends CommandLineProgram { } } - private void showBlocks(File bamFile, Integer startPosition, Integer stopPosition) { - int blockNumber = 0; - - try { - BAMBlockIterator iterator = new BAMBlockIterator(bamFile); - while(iterator.hasNext()) { - Block block = iterator.next(); - blockNumber++; - - if(startPosition != null && startPosition > blockNumber) continue; - if(stopPosition != null && stopPosition < blockNumber) break; - - System.out.printf("Block number = %d; position = %d; compressed size = %d; uncompressed size = %d%n", blockNumber, block.position, block.compressedBlockSize, block.uncompressedBlockSize); - } - } - catch(IOException ex) { - throw new StingException("Unable to open BAM file"); - } - } - private void showIndexBins(File bamFile) { BAMFileIndexContentInspector inspector = new BAMFileIndexContentInspector(bamFile); inspector.inspect(System.out,null,null); diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java index 00426bb15..e58551d2c 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShard.java @@ -34,9 +34,16 @@ public class BlockDelimitedReadShard extends ReadShard implements 
BAMFormatAware */ private final Collection reads = new ArrayList(BlockDelimitedReadShardStrategy.MAX_READS); - public BlockDelimitedReadShard(Reads sourceInfo, Map> chunks) { + /** + * A BlockDelimitedReadShard can be used for more than one shard type. + * Track which type is being used. + */ + private final Shard.ShardType shardType; + + public BlockDelimitedReadShard(Reads sourceInfo, Map> chunks, Shard.ShardType shardType) { this.sourceInfo = sourceInfo; this.chunks = chunks; + this.shardType = shardType; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java index 3a8808dcf..506f3cecf 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/BlockDelimitedReadShardStrategy.java @@ -5,6 +5,7 @@ import net.sf.samtools.*; import java.util.*; import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; @@ -26,6 +27,9 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { */ protected final BlockDrivenSAMDataSource dataSource; + /** our storage of the genomic locations they'd like to shard over */ + private final List filePointers = new ArrayList(); + /** * Position of the last shard in the file. */ @@ -40,12 +44,14 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { * Create a new read shard strategy, loading read shards from the given BAM file. * @param dataSource Data source from which to load shards. 
*/ - public BlockDelimitedReadShardStrategy(SAMDataSource dataSource) { + public BlockDelimitedReadShardStrategy(SAMDataSource dataSource, GenomeLocSortedSet locations) { if(!(dataSource instanceof BlockDrivenSAMDataSource)) throw new StingException("Block-delimited read shard strategy cannot work without a block-driven data source."); this.dataSource = (BlockDrivenSAMDataSource)dataSource; - this.position = this.dataSource.getCurrentPosition(); + this.position = this.dataSource.getCurrentPosition(); + if(locations != null) + filePointers.addAll(IntervalSharder.shardIntervals(this.dataSource,locations.toList(),this.dataSource.getNumIndexLevels()-1)); } /** @@ -65,17 +71,41 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy { if(!hasNext()) throw new NoSuchElementException("No such element available: SAM reader has arrived at last shard."); - // TODO: This level of processing should not be necessary. - Map> boundedPosition = new HashMap>(); - for(Map.Entry entry: position.entrySet()) - boundedPosition.put(entry.getKey(),Collections.singletonList(entry.getValue())); + Map> shardPosition = null; + if(!filePointers.isEmpty()) { + boolean foundData = false; + for(FilePointer filePointer: filePointers) { + shardPosition = dataSource.getFilePointersBounding(filePointer.bin); + for(SAMFileReader2 reader: shardPosition.keySet()) { + List chunks = shardPosition.get(reader); + Chunk filePosition = position.get(reader); + for(Chunk chunk: chunks) { + if(filePosition.getChunkStart() > chunk.getChunkEnd()) + chunks.remove(chunk); + else { + if(filePosition.getChunkStart() > chunk.getChunkStart()) + chunk.setChunkStart(filePosition.getChunkStart()); + foundData = true; + } + } + } + if(foundData) + break; + } + } + else { + // TODO: This level of processing should not be necessary. 
+ shardPosition = new HashMap>(); + for(Map.Entry entry: position.entrySet()) + shardPosition.put(entry.getKey(),Collections.singletonList(entry.getValue())); + } - BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),boundedPosition); + BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,Shard.ShardType.READ); atEndOfStream = dataSource.fillShard(shard); this.position = dataSource.getCurrentPosition(); - return shard; + return shard; } /** diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java index 266887914..05ad02214 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IndexDelimitedLocusShardStrategy.java @@ -1,16 +1,13 @@ package org.broadinstitute.sting.gatk.datasources.shards; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; import java.util.*; import net.sf.samtools.Chunk; -import net.sf.samtools.Bin; import net.sf.samtools.SAMFileReader2; /* @@ -46,7 +43,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { /** * The data source to use when performing this sharding. 
*/ - private final BlockDrivenSAMDataSource blockDrivenDataSource; + private final BlockDrivenSAMDataSource dataSource; /** our storage of the genomic locations they'd like to shard over */ private final List filePointers = new ArrayList(); @@ -65,108 +62,11 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { if(!(dataSource instanceof BlockDrivenSAMDataSource)) throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source."); - blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource; - filePointers.addAll(batchLociIntoBins(locations.toList(),blockDrivenDataSource.getNumIndexLevels()-1)); + this.dataSource = (BlockDrivenSAMDataSource)dataSource; + filePointers.addAll(IntervalSharder.shardIntervals(this.dataSource,locations.toList(),this.dataSource.getNumIndexLevels()-1)); filePointerIterator = filePointers.iterator(); } - private List batchLociIntoBins(final List loci, final int binsDeeperThan) { - // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. - List filePointers = new ArrayList(); - FilePointer filePointer = null; - - for(GenomeLoc location: loci) { - int locationStart = (int)location.getStart(); - final int locationStop = (int)location.getStop(); - - List bins = findBinsAtLeastAsDeepAs(blockDrivenDataSource.getOverlappingBins(location),binsDeeperThan); - - // Recursive stopping condition -- algorithm is at the zero point and no bins have been found. - if(binsDeeperThan == 0 && bins.size() == 0) { - filePointers.add(new FilePointer(location)); - continue; - } - - // No bins found; step up a level and search again. - if(bins.size() == 0) { - if(filePointer != null && filePointer.locations.size() > 0) { - filePointers.add(filePointer); - filePointer = null; - } - - filePointers.addAll(batchLociIntoBins(Collections.singletonList(location),binsDeeperThan-1)); - continue; - } - - // Bins found; try to match bins with locations. 
- Collections.sort(bins); - Iterator binIterator = bins.iterator(); - - while(locationStop >= locationStart) { - int binStart = filePointer!=null ? blockDrivenDataSource.getFirstLocusInBin(filePointer.bin) : 0; - int binStop = filePointer!=null ? blockDrivenDataSource.getLastLocusInBin(filePointer.bin) : 0; - - while(binStop < locationStart && binIterator.hasNext()) { - if(filePointer != null && filePointer.locations.size() > 0) - filePointers.add(filePointer); - - filePointer = new FilePointer(binIterator.next()); - binStart = blockDrivenDataSource.getFirstLocusInBin(filePointer.bin); - binStop = blockDrivenDataSource.getLastLocusInBin(filePointer.bin); - } - - if(locationStart < binStart) { - // The region starts before the first bin in the sequence. Add the region occurring before the sequence. - if(filePointer != null && filePointer.locations.size() > 0) { - filePointers.add(filePointer); - filePointer = null; - } - - final int regionStop = Math.min(locationStop,binStart-1); - - GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop); - filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1)); - - locationStart = regionStop + 1; - } - else if(locationStart > binStop) { - // The region starts after the last bin in the sequence. Add the region occurring after the sequence. - if(filePointer != null && filePointer.locations.size() > 0) { - filePointers.add(filePointer); - filePointer = null; - } - - GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop); - filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1)); - - locationStart = locationStop + 1; - } - else { - // The start of the region overlaps the bin. Add the overlapping subset. 
- final int regionStop = Math.min(locationStop,binStop); - filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(), - locationStart, - regionStop)); - locationStart = regionStop + 1; - } - } - } - - if(filePointer != null && filePointer.locations.size() > 0) - filePointers.add(filePointer); - - return filePointers; - } - - private List findBinsAtLeastAsDeepAs(final List bins, final int deepestBinLevel) { - List deepestBins = new ArrayList(); - for(Bin bin: bins) { - if(blockDrivenDataSource.getLevelForBin(bin) >= deepestBinLevel) - deepestBins.add(bin); - } - return deepestBins; - } - /** * returns true if there are additional shards * @@ -183,7 +83,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { */ public IndexDelimitedLocusShard next() { FilePointer nextFilePointer = filePointerIterator.next(); - Map> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin); + Map> chunksBounding = dataSource.getFilePointersBounding(nextFilePointer.bin); return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL); } @@ -200,28 +100,4 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy { public Iterator iterator() { return this; } - - /** - * Represents a small section of a BAM file, and every associated interval. 
- */ - private class FilePointer { - private final Bin bin; - private final List locations; - - public FilePointer(Bin bin) { - this.bin = bin; - this.locations = new ArrayList(); - } - - public FilePointer(GenomeLoc location) { - bin = null; - locations = Collections.singletonList(location); - } - - public void addLocation(GenomeLoc location) { - locations.add(location); - } - - } - } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java new file mode 100644 index 000000000..76fbdec92 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/IntervalSharder.java @@ -0,0 +1,142 @@ +package org.broadinstitute.sting.gatk.datasources.shards; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource; + +import java.util.List; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; + +import net.sf.samtools.Bin; + +/** + * Shard intervals based on position within the BAM file. + * + * @author mhanna + * @version 0.1 + */ +public class IntervalSharder { + protected static List shardIntervals(final BlockDrivenSAMDataSource dataSource, final List loci, final int binsDeeperThan) { + // Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin. + List filePointers = new ArrayList(); + FilePointer filePointer = null; + + for(GenomeLoc location: loci) { + int locationStart = (int)location.getStart(); + final int locationStop = (int)location.getStop(); + + List bins = findBinsAtLeastAsDeepAs(dataSource,dataSource.getOverlappingBins(location),binsDeeperThan); + + // Recursive stopping condition -- algorithm is at the zero point and no bins have been found. 
+ if(binsDeeperThan == 0 && bins.size() == 0) { + filePointers.add(new FilePointer(location)); + continue; + } + + // No bins found; step up a level and search again. + if(bins.size() == 0) { + if(filePointer != null && filePointer.locations.size() > 0) { + filePointers.add(filePointer); + filePointer = null; + } + + filePointers.addAll(shardIntervals(dataSource,Collections.singletonList(location),binsDeeperThan-1)); + continue; + } + + // Bins found; try to match bins with locations. + Collections.sort(bins); + Iterator binIterator = bins.iterator(); + + while(locationStop >= locationStart) { + int binStart = filePointer!=null ? dataSource.getFirstLocusInBin(filePointer.bin) : 0; + int binStop = filePointer!=null ? dataSource.getLastLocusInBin(filePointer.bin) : 0; + + while(binStop < locationStart && binIterator.hasNext()) { + if(filePointer != null && filePointer.locations.size() > 0) + filePointers.add(filePointer); + + filePointer = new FilePointer(binIterator.next()); + binStart = dataSource.getFirstLocusInBin(filePointer.bin); + binStop = dataSource.getLastLocusInBin(filePointer.bin); + } + + if(locationStart < binStart) { + // The region starts before the first bin in the sequence. Add the region occurring before the sequence. + if(filePointer != null && filePointer.locations.size() > 0) { + filePointers.add(filePointer); + filePointer = null; + } + + final int regionStop = Math.min(locationStop,binStart-1); + + GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop); + filePointers.addAll(shardIntervals(dataSource,Collections.singletonList(subset),binsDeeperThan-1)); + + locationStart = regionStop + 1; + } + else if(locationStart > binStop) { + // The region starts after the last bin in the sequence. Add the region occurring after the sequence. 
+ if(filePointer != null && filePointer.locations.size() > 0) { + filePointers.add(filePointer); + filePointer = null; + } + + GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop); + filePointers.addAll(shardIntervals(dataSource,Collections.singletonList(subset),binsDeeperThan-1)); + + locationStart = locationStop + 1; + } + else { + // The start of the region overlaps the bin. Add the overlapping subset. + final int regionStop = Math.min(locationStop,binStop); + filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(), + locationStart, + regionStop)); + locationStart = regionStop + 1; + } + } + } + + if(filePointer != null && filePointer.locations.size() > 0) + filePointers.add(filePointer); + + return filePointers; + } + + private static List findBinsAtLeastAsDeepAs(final BlockDrivenSAMDataSource dataSource, final List bins, final int deepestBinLevel) { + List deepestBins = new ArrayList(); + for(Bin bin: bins) { + if(dataSource.getLevelForBin(bin) >= deepestBinLevel) + deepestBins.add(bin); + } + return deepestBins; + } +} + +/** + * Represents a small section of a BAM file, and every associated interval. 
+ */ +class FilePointer { + protected final Bin bin; + protected final List locations; + + public FilePointer(Bin bin) { + this.bin = bin; + this.locations = new ArrayList(); + } + + public FilePointer(GenomeLoc location) { + bin = null; + locations = Collections.singletonList(location); + } + + public void addLocation(GenomeLoc location) { + locations.add(location); + } + +} + + diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java index 8bb024160..a084d01c6 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/shards/ShardStrategyFactory.java @@ -78,7 +78,7 @@ public class ShardStrategyFactory { case LOCUS_EXPERIMENTAL: throw new UnsupportedOperationException("Cannot do experimental locus sharding without intervals"); case READS_EXPERIMENTAL: - return new BlockDelimitedReadShardStrategy(dataSource); + return new BlockDelimitedReadShardStrategy(dataSource,null); default: throw new StingException("Strategy: " + strat + " isn't implemented for this type of shatter request"); } @@ -118,7 +118,7 @@ public class ShardStrategyFactory { case LOCUS_EXPERIMENTAL: return new IndexDelimitedLocusShardStrategy(dataSource,lst); case READS_EXPERIMENTAL: - throw new UnsupportedOperationException("Cannot do experimental read sharding with intervals"); + return new BlockDelimitedReadShardStrategy(dataSource,lst); default: throw new StingException("Strategy: " + strat + " isn't implemented"); } diff --git a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 4957b914d..fe96a0bd6 100644 --- a/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -3,7 
+3,6 @@ package org.broadinstitute.sting.gatk.executive; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.shards.Shard; import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy; -import org.broadinstitute.sting.gatk.datasources.shards.ReadShard; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -11,7 +10,6 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; import java.util.Collection;