Basic functionality for intervaled reads in new sharding system. Not currently filtering out cruft, so
the mode of operation is currently queryOverlapping rather than queryContained. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2899 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
cfff486338
commit
30eb28886b
|
|
@ -1,95 +0,0 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.PicardException;
|
||||
|
||||
import java.util.NoSuchElementException;
|
||||
import java.io.IOException;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Walks over a BAM file, discovering and returning the starting location of each block
|
||||
* in chunk format.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BAMBlockIterator implements CloseableIterator<Block> {
|
||||
/**
|
||||
* The input stream for back files.
|
||||
*/
|
||||
private FileInputStream inputStream;
|
||||
|
||||
/**
|
||||
* File channel from which to read chunks.
|
||||
*/
|
||||
private BlockReader blockReader;
|
||||
|
||||
/**
|
||||
* What is the current position of this block within the BAM file?
|
||||
*/
|
||||
private long position = 0;
|
||||
|
||||
/**
|
||||
* Iterate through the BAM chunks in a file.
|
||||
* @param file stream File to use when accessing the BAM.
|
||||
* @throws IOException in case of problems reading from the file.
|
||||
*/
|
||||
public BAMBlockIterator(File file) throws IOException {
|
||||
inputStream = new FileInputStream(file);
|
||||
blockReader = new BlockReader(inputStream);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the existing file.
|
||||
*/
|
||||
public void close() {
|
||||
try {
|
||||
inputStream.close();
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new PicardException("Unable to close file", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Are there other chunks to retrieve in this file?
|
||||
* @return True if other chunks are available, false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
try {
|
||||
return !blockReader.eof(position);
|
||||
} catch(IOException ex) {
|
||||
throw new SAMException("Unable to check file for a next BAM record", ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the next chunk from the iterator.
|
||||
* @return The next chunk.
|
||||
* @throws NoSuchElementException if no next chunk is available.
|
||||
*/
|
||||
public Block next() {
|
||||
if(!hasNext())
|
||||
throw new NoSuchElementException("No next chunk is available.");
|
||||
|
||||
Block block = null;
|
||||
try {
|
||||
block = blockReader.getBlockAt(position);
|
||||
position = block.position + block.compressedBlockSize;
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new SAMException("Unable to completely read chunk at end of file.", ex);
|
||||
}
|
||||
return block;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a chunk from the iterator.
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("BAMChunkIterator: Cannot remove a BAM chunk.");
|
||||
}
|
||||
}
|
||||
|
|
@ -1,88 +0,0 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Adapts a list of BAM blocks to BAM chunks.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BAMChunkIterator implements Iterator<Chunk> {
|
||||
/**
|
||||
* The wrapped chunk iterator.
|
||||
*/
|
||||
private final BAMBlockIterator blockIterator;
|
||||
|
||||
/**
|
||||
* List of prefetched segments. If prefetchedSegments is
|
||||
* empty, this should mean that there's nothing left in the queue.
|
||||
*/
|
||||
private Queue<BlockSegment> prefetchedSegments;
|
||||
|
||||
/**
|
||||
* List of chunks to filter out of the final chunk list.
|
||||
*/
|
||||
private final Queue<Chunk> filters;
|
||||
|
||||
/**
|
||||
* Creates a new chunk iterator wrapping the given block, optionally filtering
|
||||
* out a list of chunks from those blocks.
|
||||
* TODO: This class is too smart. Wrap filtering functionality in another class.
|
||||
* @param blockIterator block iterator to wrap.
|
||||
* @param filters List of chunks to filter out of the given input stream.
|
||||
*/
|
||||
public BAMChunkIterator(BAMBlockIterator blockIterator, List<Chunk> filters) {
|
||||
this.blockIterator = blockIterator;
|
||||
this.prefetchedSegments = new LinkedList<BlockSegment>();
|
||||
this.filters = new PriorityQueue<Chunk>(filters);
|
||||
seedNextSegments();
|
||||
}
|
||||
|
||||
/**
|
||||
* Is a next chunk available.
|
||||
* @return true if a next chunk is available; false otherwise.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return !prefetchedSegments.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next chunk in the list.
|
||||
* @return Next block, adapted to a chunk.
|
||||
*/
|
||||
public Chunk next() {
|
||||
if(prefetchedSegments.isEmpty())
|
||||
throw new NoSuchElementException("BAMChunkIterator is exhausted.");
|
||||
// TODO: Maybe tie together adjacent chunks together?
|
||||
Chunk nextChunk = prefetchedSegments.remove().toChunk();
|
||||
seedNextSegments();
|
||||
return nextChunk;
|
||||
}
|
||||
|
||||
/**
|
||||
* @throws UnsupportedOperationException always.
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Cannot remove from a BAM chunk iterator");
|
||||
}
|
||||
|
||||
/**
|
||||
* Seed the next value or set of values for the iterator, if necessary.
|
||||
*/
|
||||
private void seedNextSegments() {
|
||||
while(prefetchedSegments.isEmpty() && blockIterator.hasNext()) {
|
||||
// Add the new segments to the prefetch...
|
||||
prefetchedSegments.add(new BlockSegment(blockIterator.next()));
|
||||
|
||||
// And iteratively subtract the filtering chunks.
|
||||
for(Chunk filter: filters) {
|
||||
Queue<BlockSegment> filteredBlockSegments = new LinkedList<BlockSegment>();
|
||||
for(BlockSegment blockSegment: prefetchedSegments)
|
||||
filteredBlockSegments.addAll(blockSegment.minus(filter));
|
||||
prefetchedSegments = filteredBlockSegments;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,127 +0,0 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import net.sf.samtools.util.BlockCompressedInputStream;
|
||||
import net.sf.samtools.util.BinaryCodec;
|
||||
import net.sf.samtools.util.StringLineReader;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.DataInputStream;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Loads a BAM file header from an file, optionally providing its position
|
||||
* within the file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BAMFileHeaderLoader {
|
||||
/**
|
||||
* The contents of the BAM file header.
|
||||
*/
|
||||
private final SAMFileHeader header;
|
||||
|
||||
/**
|
||||
* Location of the header within the BAM file.
|
||||
*/
|
||||
private final Chunk location;
|
||||
|
||||
public static final Chunk preambleLocation = new Chunk(0<<16 | 0, 0<<16 | 3);
|
||||
|
||||
/**
|
||||
* Load the header from the given file.
|
||||
* @param header the parsed haeder for the BAM file.
|
||||
* @param location the location of the header (start and stop) within the BAM.
|
||||
*/
|
||||
private BAMFileHeaderLoader(SAMFileHeader header, Chunk location) {
|
||||
this.header = header;
|
||||
this.location = location;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the header for the given BAM file.
|
||||
* @return The header for this BAM file.
|
||||
*/
|
||||
public SAMFileHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the location of the header within the given BAM file, in chunk format.
|
||||
* @return the location of the header, in chunk coordinates.
|
||||
*/
|
||||
public Chunk getLocation() {
|
||||
return location;
|
||||
}
|
||||
|
||||
public static BAMFileHeaderLoader load(File file) throws IOException {
|
||||
BlockCompressedInputStream inputStream = new BlockCompressedInputStream(file);
|
||||
BinaryCodec binaryCodec = new BinaryCodec(new DataInputStream(inputStream));
|
||||
|
||||
final byte[] buffer = new byte[4];
|
||||
binaryCodec.readBytes(buffer);
|
||||
if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) {
|
||||
throw new IOException("Invalid BAM file header");
|
||||
}
|
||||
|
||||
final int headerTextLength = binaryCodec.readInt();
|
||||
final String textHeader = binaryCodec.readString(headerTextLength);
|
||||
final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec();
|
||||
headerCodec.setValidationStringency(SAMFileReader.ValidationStringency.SILENT);
|
||||
SAMFileHeader header = headerCodec.decode(new StringLineReader(textHeader),file.getAbsolutePath());
|
||||
|
||||
// directly copied from BAMFileReader...
|
||||
final int sequenceCount = binaryCodec.readInt();
|
||||
if (header.getSequenceDictionary().size() > 0) {
|
||||
// It is allowed to have binary sequences but no text sequences, so only validate if both are present
|
||||
if (sequenceCount != header.getSequenceDictionary().size()) {
|
||||
throw new SAMFormatException("Number of sequences in text header (" +
|
||||
header.getSequenceDictionary().size() +
|
||||
") != number of sequences in binary header (" + sequenceCount + ") for file " + file);
|
||||
}
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(binaryCodec,file);
|
||||
final SAMSequenceRecord sequenceRecord = header.getSequence(i);
|
||||
if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " +
|
||||
binaryCodec);
|
||||
}
|
||||
if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
|
||||
throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " +
|
||||
binaryCodec);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If only binary sequences are present, copy them into mFileHeader
|
||||
final List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>(sequenceCount);
|
||||
for (int i = 0; i < sequenceCount; i++) {
|
||||
sequences.add(readSequenceRecord(binaryCodec,file));
|
||||
}
|
||||
header.setSequenceDictionary(new SAMSequenceDictionary(sequences));
|
||||
}
|
||||
inputStream.close();
|
||||
|
||||
return new BAMFileHeaderLoader(header,new Chunk(buffer.length,inputStream.getFilePointer()-1));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a single binary sequence record from the file or stream
|
||||
* @param binaryCodec stream to read from.
|
||||
* @param file Note that this is used only for reporting errors.
|
||||
* @return an individual sequence record.
|
||||
*/
|
||||
private static SAMSequenceRecord readSequenceRecord(final BinaryCodec binaryCodec, final File file) {
|
||||
final int nameLength = binaryCodec.readInt();
|
||||
if (nameLength <= 1) {
|
||||
throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + file.getAbsolutePath());
|
||||
}
|
||||
final String sequenceName = binaryCodec.readString(nameLength - 1);
|
||||
// Skip the null terminator
|
||||
binaryCodec.readByte();
|
||||
final int sequenceLength = binaryCodec.readInt();
|
||||
return new SAMSequenceRecord(sequenceName, sequenceLength);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
package net.sf.samtools;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
|
||||
/**
|
||||
* Test harness for playing with sharding by BGZF block.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class BlockTestHarness {
|
||||
private static void usage() {
|
||||
System.out.println("Usage: ChunkTestHarness <filename>.bam");
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
public static void main(String args[]) throws IOException {
|
||||
if(args.length == 0)
|
||||
usage();
|
||||
|
||||
String bamFileName = args[0];
|
||||
if(!bamFileName.endsWith(".bam"))
|
||||
usage();
|
||||
|
||||
File bamFile = new File(bamFileName);
|
||||
if(!bamFile.exists())
|
||||
usage();
|
||||
|
||||
Chunk headerLocation = BAMFileHeaderLoader.load(bamFile).getLocation();
|
||||
System.out.printf("Header location = %s%n", headerLocation);
|
||||
|
||||
BAMChunkIterator chunkIterator = new BAMChunkIterator(new BAMBlockIterator(bamFile),
|
||||
Collections.singletonList(headerLocation));
|
||||
while(chunkIterator.hasNext()) {
|
||||
Chunk chunk = chunkIterator.next();
|
||||
}
|
||||
|
||||
// The following test code blocks a bunch of files together.
|
||||
/*
|
||||
BAMBlockIterator blockIterator = new BAMBlockIterator(bamFile);
|
||||
long blockCount = 0;
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
while(blockIterator.hasNext()) {
|
||||
Block block = blockIterator.next();
|
||||
blockCount++;
|
||||
System.out.println(block);
|
||||
}
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.out.printf("Number of chunks: %d; elapsed time: %dms%n", blockCount, endTime-startTime);
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
|
@ -30,7 +30,7 @@ public class Chunk implements Cloneable,Comparable<Chunk> {
|
|||
return mChunkStart;
|
||||
}
|
||||
|
||||
void setChunkStart(final long value) {
|
||||
public void setChunkStart(final long value) {
|
||||
mChunkStart = value;
|
||||
}
|
||||
|
||||
|
|
@ -38,7 +38,7 @@ public class Chunk implements Cloneable,Comparable<Chunk> {
|
|||
return mChunkEnd;
|
||||
}
|
||||
|
||||
void setChunkEnd(final long value) {
|
||||
public void setChunkEnd(final long value) {
|
||||
mChunkEnd = value;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -47,8 +47,7 @@ public class BAMFileStat extends CommandLineProgram {
|
|||
|
||||
switch(command) {
|
||||
case ShowBlocks:
|
||||
showBlocks(new File(bamFileName),startPosition,stopPosition);
|
||||
break;
|
||||
throw new StingException("The BAM block inspector has been disabled.");
|
||||
case ShowIndex:
|
||||
showIndexBins(new File(bamFileName+".bai"));
|
||||
break;
|
||||
|
|
@ -70,26 +69,6 @@ public class BAMFileStat extends CommandLineProgram {
|
|||
}
|
||||
}
|
||||
|
||||
private void showBlocks(File bamFile, Integer startPosition, Integer stopPosition) {
|
||||
int blockNumber = 0;
|
||||
|
||||
try {
|
||||
BAMBlockIterator iterator = new BAMBlockIterator(bamFile);
|
||||
while(iterator.hasNext()) {
|
||||
Block block = iterator.next();
|
||||
blockNumber++;
|
||||
|
||||
if(startPosition != null && startPosition > blockNumber) continue;
|
||||
if(stopPosition != null && stopPosition < blockNumber) break;
|
||||
|
||||
System.out.printf("Block number = %d; position = %d; compressed size = %d; uncompressed size = %d%n", blockNumber, block.position, block.compressedBlockSize, block.uncompressedBlockSize);
|
||||
}
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new StingException("Unable to open BAM file");
|
||||
}
|
||||
}
|
||||
|
||||
private void showIndexBins(File bamFile) {
|
||||
BAMFileIndexContentInspector inspector = new BAMFileIndexContentInspector(bamFile);
|
||||
inspector.inspect(System.out,null,null);
|
||||
|
|
|
|||
|
|
@ -34,9 +34,16 @@ public class BlockDelimitedReadShard extends ReadShard implements BAMFormatAware
|
|||
*/
|
||||
private final Collection<SAMRecord> reads = new ArrayList<SAMRecord>(BlockDelimitedReadShardStrategy.MAX_READS);
|
||||
|
||||
public BlockDelimitedReadShard(Reads sourceInfo, Map<SAMFileReader2,List<Chunk>> chunks) {
|
||||
/**
|
||||
* An BlockDelimitedLocusShard can be used either for READ or READ shard types.
|
||||
* Track which type is being used.
|
||||
*/
|
||||
private final Shard.ShardType shardType;
|
||||
|
||||
public BlockDelimitedReadShard(Reads sourceInfo, Map<SAMFileReader2,List<Chunk>> chunks, Shard.ShardType shardType) {
|
||||
this.sourceInfo = sourceInfo;
|
||||
this.chunks = chunks;
|
||||
this.shardType = shardType;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import net.sf.samtools.*;
|
|||
import java.util.*;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
|
||||
|
||||
|
|
@ -26,6 +27,9 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy {
|
|||
*/
|
||||
protected final BlockDrivenSAMDataSource dataSource;
|
||||
|
||||
/** our storage of the genomic locations they'd like to shard over */
|
||||
private final List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
|
||||
/**
|
||||
* Position of the last shard in the file.
|
||||
*/
|
||||
|
|
@ -40,12 +44,14 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy {
|
|||
* Create a new read shard strategy, loading read shards from the given BAM file.
|
||||
* @param dataSource Data source from which to load shards.
|
||||
*/
|
||||
public BlockDelimitedReadShardStrategy(SAMDataSource dataSource) {
|
||||
public BlockDelimitedReadShardStrategy(SAMDataSource dataSource, GenomeLocSortedSet locations) {
|
||||
if(!(dataSource instanceof BlockDrivenSAMDataSource))
|
||||
throw new StingException("Block-delimited read shard strategy cannot work without a block-driven data source.");
|
||||
|
||||
this.dataSource = (BlockDrivenSAMDataSource)dataSource;
|
||||
this.position = this.dataSource.getCurrentPosition();
|
||||
this.position = this.dataSource.getCurrentPosition();
|
||||
if(locations != null)
|
||||
filePointers.addAll(IntervalSharder.shardIntervals(this.dataSource,locations.toList(),this.dataSource.getNumIndexLevels()-1));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -65,17 +71,41 @@ public class BlockDelimitedReadShardStrategy extends ReadShardStrategy {
|
|||
if(!hasNext())
|
||||
throw new NoSuchElementException("No such element available: SAM reader has arrived at last shard.");
|
||||
|
||||
// TODO: This level of processing should not be necessary.
|
||||
Map<SAMFileReader2,List<Chunk>> boundedPosition = new HashMap<SAMFileReader2,List<Chunk>>();
|
||||
for(Map.Entry<SAMFileReader2,Chunk> entry: position.entrySet())
|
||||
boundedPosition.put(entry.getKey(),Collections.singletonList(entry.getValue()));
|
||||
Map<SAMFileReader2,List<Chunk>> shardPosition = null;
|
||||
if(!filePointers.isEmpty()) {
|
||||
boolean foundData = false;
|
||||
for(FilePointer filePointer: filePointers) {
|
||||
shardPosition = dataSource.getFilePointersBounding(filePointer.bin);
|
||||
for(SAMFileReader2 reader: shardPosition.keySet()) {
|
||||
List<Chunk> chunks = shardPosition.get(reader);
|
||||
Chunk filePosition = position.get(reader);
|
||||
for(Chunk chunk: chunks) {
|
||||
if(filePosition.getChunkStart() > chunk.getChunkEnd())
|
||||
chunks.remove(chunk);
|
||||
else {
|
||||
if(filePosition.getChunkStart() > chunk.getChunkStart())
|
||||
chunk.setChunkStart(filePosition.getChunkStart());
|
||||
foundData = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(foundData)
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// TODO: This level of processing should not be necessary.
|
||||
shardPosition = new HashMap<SAMFileReader2,List<Chunk>>();
|
||||
for(Map.Entry<SAMFileReader2,Chunk> entry: position.entrySet())
|
||||
shardPosition.put(entry.getKey(),Collections.singletonList(entry.getValue()));
|
||||
}
|
||||
|
||||
BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),boundedPosition);
|
||||
BAMFormatAwareShard shard = new BlockDelimitedReadShard(dataSource.getReadsInfo(),shardPosition,Shard.ShardType.READ);
|
||||
atEndOfStream = dataSource.fillShard(shard);
|
||||
|
||||
this.position = dataSource.getCurrentPosition();
|
||||
|
||||
return shard;
|
||||
return shard;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,16 +1,13 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.shards;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.Chunk;
|
||||
import net.sf.samtools.Bin;
|
||||
import net.sf.samtools.SAMFileReader2;
|
||||
|
||||
/*
|
||||
|
|
@ -46,7 +43,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
|||
/**
|
||||
* The data source to use when performing this sharding.
|
||||
*/
|
||||
private final BlockDrivenSAMDataSource blockDrivenDataSource;
|
||||
private final BlockDrivenSAMDataSource dataSource;
|
||||
|
||||
/** our storage of the genomic locations they'd like to shard over */
|
||||
private final List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
|
|
@ -65,108 +62,11 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
|||
if(!(dataSource instanceof BlockDrivenSAMDataSource))
|
||||
throw new StingException("Cannot power an IndexDelimitedLocusShardStrategy with this data source.");
|
||||
|
||||
blockDrivenDataSource = (BlockDrivenSAMDataSource)dataSource;
|
||||
filePointers.addAll(batchLociIntoBins(locations.toList(),blockDrivenDataSource.getNumIndexLevels()-1));
|
||||
this.dataSource = (BlockDrivenSAMDataSource)dataSource;
|
||||
filePointers.addAll(IntervalSharder.shardIntervals(this.dataSource,locations.toList(),this.dataSource.getNumIndexLevels()-1));
|
||||
filePointerIterator = filePointers.iterator();
|
||||
}
|
||||
|
||||
private List<FilePointer> batchLociIntoBins(final List<GenomeLoc> loci, final int binsDeeperThan) {
|
||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
FilePointer filePointer = null;
|
||||
|
||||
for(GenomeLoc location: loci) {
|
||||
int locationStart = (int)location.getStart();
|
||||
final int locationStop = (int)location.getStop();
|
||||
|
||||
List<Bin> bins = findBinsAtLeastAsDeepAs(blockDrivenDataSource.getOverlappingBins(location),binsDeeperThan);
|
||||
|
||||
// Recursive stopping condition -- algorithm is at the zero point and no bins have been found.
|
||||
if(binsDeeperThan == 0 && bins.size() == 0) {
|
||||
filePointers.add(new FilePointer(location));
|
||||
continue;
|
||||
}
|
||||
|
||||
// No bins found; step up a level and search again.
|
||||
if(bins.size() == 0) {
|
||||
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||
filePointers.add(filePointer);
|
||||
filePointer = null;
|
||||
}
|
||||
|
||||
filePointers.addAll(batchLociIntoBins(Collections.singletonList(location),binsDeeperThan-1));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Bins found; try to match bins with locations.
|
||||
Collections.sort(bins);
|
||||
Iterator<Bin> binIterator = bins.iterator();
|
||||
|
||||
while(locationStop >= locationStart) {
|
||||
int binStart = filePointer!=null ? blockDrivenDataSource.getFirstLocusInBin(filePointer.bin) : 0;
|
||||
int binStop = filePointer!=null ? blockDrivenDataSource.getLastLocusInBin(filePointer.bin) : 0;
|
||||
|
||||
while(binStop < locationStart && binIterator.hasNext()) {
|
||||
if(filePointer != null && filePointer.locations.size() > 0)
|
||||
filePointers.add(filePointer);
|
||||
|
||||
filePointer = new FilePointer(binIterator.next());
|
||||
binStart = blockDrivenDataSource.getFirstLocusInBin(filePointer.bin);
|
||||
binStop = blockDrivenDataSource.getLastLocusInBin(filePointer.bin);
|
||||
}
|
||||
|
||||
if(locationStart < binStart) {
|
||||
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
|
||||
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||
filePointers.add(filePointer);
|
||||
filePointer = null;
|
||||
}
|
||||
|
||||
final int regionStop = Math.min(locationStop,binStart-1);
|
||||
|
||||
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop);
|
||||
filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
|
||||
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
else if(locationStart > binStop) {
|
||||
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
|
||||
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||
filePointers.add(filePointer);
|
||||
filePointer = null;
|
||||
}
|
||||
|
||||
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop);
|
||||
filePointers.addAll(batchLociIntoBins(Collections.singletonList(subset),binsDeeperThan-1));
|
||||
|
||||
locationStart = locationStop + 1;
|
||||
}
|
||||
else {
|
||||
// The start of the region overlaps the bin. Add the overlapping subset.
|
||||
final int regionStop = Math.min(locationStop,binStop);
|
||||
filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(),
|
||||
locationStart,
|
||||
regionStop));
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(filePointer != null && filePointer.locations.size() > 0)
|
||||
filePointers.add(filePointer);
|
||||
|
||||
return filePointers;
|
||||
}
|
||||
|
||||
private List<Bin> findBinsAtLeastAsDeepAs(final List<Bin> bins, final int deepestBinLevel) {
|
||||
List<Bin> deepestBins = new ArrayList<Bin>();
|
||||
for(Bin bin: bins) {
|
||||
if(blockDrivenDataSource.getLevelForBin(bin) >= deepestBinLevel)
|
||||
deepestBins.add(bin);
|
||||
}
|
||||
return deepestBins;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns true if there are additional shards
|
||||
*
|
||||
|
|
@ -183,7 +83,7 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
|||
*/
|
||||
public IndexDelimitedLocusShard next() {
|
||||
FilePointer nextFilePointer = filePointerIterator.next();
|
||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = blockDrivenDataSource.getFilePointersBounding(nextFilePointer.bin);
|
||||
Map<SAMFileReader2,List<Chunk>> chunksBounding = dataSource.getFilePointersBounding(nextFilePointer.bin);
|
||||
return new IndexDelimitedLocusShard(nextFilePointer.locations,chunksBounding,Shard.ShardType.LOCUS_INTERVAL);
|
||||
}
|
||||
|
||||
|
|
@ -200,28 +100,4 @@ public class IndexDelimitedLocusShardStrategy implements ShardStrategy {
|
|||
public Iterator<Shard> iterator() {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a small section of a BAM file, and every associated interval.
|
||||
*/
|
||||
private class FilePointer {
|
||||
private final Bin bin;
|
||||
private final List<GenomeLoc> locations;
|
||||
|
||||
public FilePointer(Bin bin) {
|
||||
this.bin = bin;
|
||||
this.locations = new ArrayList<GenomeLoc>();
|
||||
}
|
||||
|
||||
public FilePointer(GenomeLoc location) {
|
||||
bin = null;
|
||||
locations = Collections.singletonList(location);
|
||||
}
|
||||
|
||||
public void addLocation(GenomeLoc location) {
|
||||
locations.add(location);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,142 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.shards;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.BlockDrivenSAMDataSource;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.sf.samtools.Bin;
|
||||
|
||||
/**
|
||||
* Shard intervals based on position within the BAM file.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class IntervalSharder {
|
||||
protected static List<FilePointer> shardIntervals(final BlockDrivenSAMDataSource dataSource, final List<GenomeLoc> loci, final int binsDeeperThan) {
|
||||
// Gather bins for the given loci, splitting loci as necessary so that each falls into exactly one lowest-level bin.
|
||||
List<FilePointer> filePointers = new ArrayList<FilePointer>();
|
||||
FilePointer filePointer = null;
|
||||
|
||||
for(GenomeLoc location: loci) {
|
||||
int locationStart = (int)location.getStart();
|
||||
final int locationStop = (int)location.getStop();
|
||||
|
||||
List<Bin> bins = findBinsAtLeastAsDeepAs(dataSource,dataSource.getOverlappingBins(location),binsDeeperThan);
|
||||
|
||||
// Recursive stopping condition -- algorithm is at the zero point and no bins have been found.
|
||||
if(binsDeeperThan == 0 && bins.size() == 0) {
|
||||
filePointers.add(new FilePointer(location));
|
||||
continue;
|
||||
}
|
||||
|
||||
// No bins found; step up a level and search again.
|
||||
if(bins.size() == 0) {
|
||||
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||
filePointers.add(filePointer);
|
||||
filePointer = null;
|
||||
}
|
||||
|
||||
filePointers.addAll(shardIntervals(dataSource,Collections.singletonList(location),binsDeeperThan-1));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Bins found; try to match bins with locations.
|
||||
Collections.sort(bins);
|
||||
Iterator<Bin> binIterator = bins.iterator();
|
||||
|
||||
while(locationStop >= locationStart) {
|
||||
int binStart = filePointer!=null ? dataSource.getFirstLocusInBin(filePointer.bin) : 0;
|
||||
int binStop = filePointer!=null ? dataSource.getLastLocusInBin(filePointer.bin) : 0;
|
||||
|
||||
while(binStop < locationStart && binIterator.hasNext()) {
|
||||
if(filePointer != null && filePointer.locations.size() > 0)
|
||||
filePointers.add(filePointer);
|
||||
|
||||
filePointer = new FilePointer(binIterator.next());
|
||||
binStart = dataSource.getFirstLocusInBin(filePointer.bin);
|
||||
binStop = dataSource.getLastLocusInBin(filePointer.bin);
|
||||
}
|
||||
|
||||
if(locationStart < binStart) {
|
||||
// The region starts before the first bin in the sequence. Add the region occurring before the sequence.
|
||||
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||
filePointers.add(filePointer);
|
||||
filePointer = null;
|
||||
}
|
||||
|
||||
final int regionStop = Math.min(locationStop,binStart-1);
|
||||
|
||||
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,regionStop);
|
||||
filePointers.addAll(shardIntervals(dataSource,Collections.singletonList(subset),binsDeeperThan-1));
|
||||
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
else if(locationStart > binStop) {
|
||||
// The region starts after the last bin in the sequence. Add the region occurring after the sequence.
|
||||
if(filePointer != null && filePointer.locations.size() > 0) {
|
||||
filePointers.add(filePointer);
|
||||
filePointer = null;
|
||||
}
|
||||
|
||||
GenomeLoc subset = GenomeLocParser.createGenomeLoc(location.getContig(),locationStart,locationStop);
|
||||
filePointers.addAll(shardIntervals(dataSource,Collections.singletonList(subset),binsDeeperThan-1));
|
||||
|
||||
locationStart = locationStop + 1;
|
||||
}
|
||||
else {
|
||||
// The start of the region overlaps the bin. Add the overlapping subset.
|
||||
final int regionStop = Math.min(locationStop,binStop);
|
||||
filePointer.addLocation(GenomeLocParser.createGenomeLoc(location.getContig(),
|
||||
locationStart,
|
||||
regionStop));
|
||||
locationStart = regionStop + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(filePointer != null && filePointer.locations.size() > 0)
|
||||
filePointers.add(filePointer);
|
||||
|
||||
return filePointers;
|
||||
}
|
||||
|
||||
private static List<Bin> findBinsAtLeastAsDeepAs(final BlockDrivenSAMDataSource dataSource, final List<Bin> bins, final int deepestBinLevel) {
|
||||
List<Bin> deepestBins = new ArrayList<Bin>();
|
||||
for(Bin bin: bins) {
|
||||
if(dataSource.getLevelForBin(bin) >= deepestBinLevel)
|
||||
deepestBins.add(bin);
|
||||
}
|
||||
return deepestBins;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Represents a small section of a BAM file, and every associated interval.
|
||||
*/
|
||||
class FilePointer {
|
||||
protected final Bin bin;
|
||||
protected final List<GenomeLoc> locations;
|
||||
|
||||
public FilePointer(Bin bin) {
|
||||
this.bin = bin;
|
||||
this.locations = new ArrayList<GenomeLoc>();
|
||||
}
|
||||
|
||||
public FilePointer(GenomeLoc location) {
|
||||
bin = null;
|
||||
locations = Collections.singletonList(location);
|
||||
}
|
||||
|
||||
public void addLocation(GenomeLoc location) {
|
||||
locations.add(location);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -78,7 +78,7 @@ public class ShardStrategyFactory {
|
|||
case LOCUS_EXPERIMENTAL:
|
||||
throw new UnsupportedOperationException("Cannot do experimental locus sharding without intervals");
|
||||
case READS_EXPERIMENTAL:
|
||||
return new BlockDelimitedReadShardStrategy(dataSource);
|
||||
return new BlockDelimitedReadShardStrategy(dataSource,null);
|
||||
default:
|
||||
throw new StingException("Strategy: " + strat + " isn't implemented for this type of shatter request");
|
||||
}
|
||||
|
|
@ -118,7 +118,7 @@ public class ShardStrategyFactory {
|
|||
case LOCUS_EXPERIMENTAL:
|
||||
return new IndexDelimitedLocusShardStrategy(dataSource,lst);
|
||||
case READS_EXPERIMENTAL:
|
||||
throw new UnsupportedOperationException("Cannot do experimental read sharding with intervals");
|
||||
return new BlockDelimitedReadShardStrategy(dataSource,lst);
|
||||
default:
|
||||
throw new StingException("Strategy: " + strat + " isn't implemented");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ package org.broadinstitute.sting.gatk.executive;
|
|||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
|
|
@ -11,7 +10,6 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
|||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue