Cleanup GATKBAMIndex unit test to allow a more efficient access pattern for
FindLargeShards. Runtime of FindLargeShards on papuan dataset is now 75min. GATK proper should benefit as well, although the benefits might be so small as to not be measurable. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5798 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
9e08a699c6
commit
03452c15c0
|
|
@ -91,7 +91,7 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
* @param indexFiles Index files.
|
||||
* @param intervals List of
|
||||
*/
|
||||
public BAMSchedule(final Map<SAMReaderID,File> indexFiles, final List<GenomeLoc> intervals) {
|
||||
public BAMSchedule(final Map<SAMReaderID,GATKBAMIndex> indexFiles, final List<GenomeLoc> intervals) {
|
||||
if(intervals.isEmpty())
|
||||
throw new ReviewedStingException("Tried to write schedule for empty interval list.");
|
||||
|
||||
|
|
@ -102,7 +102,8 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
readerIDs.addAll(indexFiles.keySet());
|
||||
|
||||
for(final SAMReaderID reader: readerIDs) {
|
||||
GATKBAMIndex index = new GATKBAMIndex(indexFiles.get(reader),referenceSequence);
|
||||
final GATKBAMIndex index = indexFiles.get(reader);
|
||||
final GATKBAMIndexData indexData = index.readReferenceSequence(referenceSequence);
|
||||
|
||||
int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1);
|
||||
Iterator<GenomeLoc> locusIterator = intervals.iterator();
|
||||
|
|
@ -132,7 +133,7 @@ public class BAMSchedule implements CloseableIterator<BAMScheduleEntry> {
|
|||
|
||||
// Code at this point knows that the current bin is neither before nor after the current locus,
|
||||
// so it must overlap. Add this region to the filesystem.
|
||||
final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(bin);
|
||||
final GATKBAMFileSpan fileSpan = indexData.getSpanOverlapping(bin);
|
||||
|
||||
if(!fileSpan.isEmpty()) {
|
||||
// File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves.
|
||||
|
|
|
|||
|
|
@ -47,7 +47,7 @@ import java.util.NoSuchElementException;
|
|||
public class BAMScheduler implements Iterator<FilePointer> {
|
||||
private final SAMDataSource dataSource;
|
||||
|
||||
private final Map<SAMReaderID, File> indexFiles = new HashMap<SAMReaderID,File>();
|
||||
private final Map<SAMReaderID,GATKBAMIndex> indexFiles = new HashMap<SAMReaderID,GATKBAMIndex>();
|
||||
|
||||
private FilePointer nextFilePointer = null;
|
||||
|
||||
|
|
@ -60,7 +60,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) {
|
||||
this.dataSource = dataSource;
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
indexFiles.put(reader,(File)dataSource.getIndex(reader));
|
||||
indexFiles.put(reader,(GATKBAMIndex)dataSource.getIndex(reader));
|
||||
this.loci = loci;
|
||||
locusIterator = new PeekableIterator<GenomeLoc>(loci.iterator());
|
||||
if(locusIterator.hasNext())
|
||||
|
|
@ -184,11 +184,11 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
|
||||
/**
|
||||
* Get the next overlapping tree of bins associated with the given BAM file.
|
||||
* @param indexFiles BAM index file.
|
||||
* @param indices BAM indices.
|
||||
* @param currentLocus The actual locus for which to check overlap.
|
||||
* @return The next schedule entry overlapping with the given list of loci.
|
||||
*/
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,File> indexFiles, final GenomeLoc currentLocus) {
|
||||
private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map<SAMReaderID,GATKBAMIndex> indices, final GenomeLoc currentLocus) {
|
||||
// Stale reference sequence or first invocation. (Re)create the binTreeIterator.
|
||||
if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) {
|
||||
if(bamScheduleIterator != null)
|
||||
|
|
@ -202,7 +202,7 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
lociInContig.add(locus);
|
||||
}
|
||||
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indexFiles,lociInContig));
|
||||
bamScheduleIterator = new PeekableIterator<BAMScheduleEntry>(new BAMSchedule(indices,lociInContig));
|
||||
}
|
||||
|
||||
if(!bamScheduleIterator.hasNext())
|
||||
|
|
|
|||
|
|
@ -1,51 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
/**
|
||||
* Constants used in reading & writing BAM files
|
||||
*/
|
||||
class GATKBAMFileConstants {
|
||||
/**
|
||||
* The beginning of a BAMRecord is a fixed-size block of 8 int32s
|
||||
*/
|
||||
static final int FIXED_BLOCK_SIZE = 8 * 4;
|
||||
|
||||
/**
|
||||
* Sanity check -- we never expect BAMRecords to be as big as this.
|
||||
*/
|
||||
static final int MAXIMUM_RECORD_LENGTH = 1024 * 1024;
|
||||
|
||||
/**
|
||||
* BAM file magic number. This is what is present in the gunzipped version of the file,
|
||||
* which never exists on disk.
|
||||
*/
|
||||
|
||||
static final byte[] BAM_MAGIC = "BAM\1".getBytes();
|
||||
/**
|
||||
* BAM index file magic number.
|
||||
*/
|
||||
static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes();
|
||||
}
|
||||
|
|
@ -44,11 +44,17 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* A basic interface for querying BAM indices.
|
||||
* Very much not thread-safe.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class GATKBAMIndex {
|
||||
/**
|
||||
* BAM index file magic number.
|
||||
*/
|
||||
private static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes();
|
||||
|
||||
/**
|
||||
* Reports the total amount of genomic data that any bin can index.
|
||||
*/
|
||||
|
|
@ -66,48 +72,64 @@ public class GATKBAMIndex {
|
|||
|
||||
private final File mFile;
|
||||
|
||||
private final List<GATKBin> bins = new ArrayList<GATKBin>();
|
||||
private final LinearIndex linearIndex;
|
||||
/**
|
||||
* Number of sequences stored in this index.
|
||||
*/
|
||||
private final int sequenceCount;
|
||||
|
||||
public GATKBAMIndex(final File file, final int referenceSequence) {
|
||||
/**
|
||||
* A cache of the starting positions of the sequences.
|
||||
*/
|
||||
private final long[] sequenceStartCache;
|
||||
|
||||
private FileInputStream fileStream;
|
||||
private FileChannel fileChannel;
|
||||
|
||||
public GATKBAMIndex(final File file) {
|
||||
mFile = file;
|
||||
// Open the file stream.
|
||||
|
||||
FileInputStream fileStream = null;
|
||||
FileChannel fileChannel = null;
|
||||
|
||||
try {
|
||||
fileStream = new FileInputStream(mFile);
|
||||
fileChannel = fileStream.getChannel();
|
||||
}
|
||||
catch (IOException exc) {
|
||||
throw new RuntimeIOException(exc.getMessage(), exc);
|
||||
}
|
||||
openIndexFile();
|
||||
|
||||
// Verify the magic number.
|
||||
seek(fileChannel,0);
|
||||
final byte[] buffer = readBytes(fileChannel,4);
|
||||
if (!Arrays.equals(buffer, GATKBAMFileConstants.BAM_INDEX_MAGIC)) {
|
||||
seek(0);
|
||||
final byte[] buffer = readBytes(4);
|
||||
if (!Arrays.equals(buffer, BAM_INDEX_MAGIC)) {
|
||||
throw new RuntimeException("Invalid file header in BAM index " + mFile +
|
||||
": " + new String(buffer));
|
||||
}
|
||||
|
||||
seek(fileChannel,4);
|
||||
seek(4);
|
||||
|
||||
final int sequenceCount = readInteger(fileChannel);
|
||||
sequenceCount = readInteger();
|
||||
|
||||
// Create a cache of the starting position of each sequence. Initialize it to -1.
|
||||
sequenceStartCache = new long[sequenceCount];
|
||||
for(int i = 1; i < sequenceCount; i++)
|
||||
sequenceStartCache[i] = -1;
|
||||
|
||||
// Seed the first element in the array with the current position.
|
||||
if(sequenceCount > 0)
|
||||
sequenceStartCache[0] = position();
|
||||
|
||||
closeIndexFile();
|
||||
}
|
||||
|
||||
public GATKBAMIndexData readReferenceSequence(final int referenceSequence) {
|
||||
openIndexFile();
|
||||
|
||||
if (referenceSequence >= sequenceCount)
|
||||
throw new ReviewedStingException("Invalid sequence number " + referenceSequence);
|
||||
|
||||
skipToSequence(fileChannel,referenceSequence);
|
||||
skipToSequence(referenceSequence);
|
||||
|
||||
int binCount = readInteger(fileChannel);
|
||||
int binCount = readInteger();
|
||||
List<GATKBin> bins = new ArrayList<GATKBin>();
|
||||
for (int binNumber = 0; binNumber < binCount; binNumber++) {
|
||||
final int indexBin = readInteger(fileChannel);
|
||||
final int nChunks = readInteger(fileChannel);
|
||||
final int indexBin = readInteger();
|
||||
final int nChunks = readInteger();
|
||||
|
||||
List<GATKChunk> chunks = new ArrayList<GATKChunk>(nChunks);
|
||||
long[] rawChunkData = readLongs(fileChannel,nChunks*2);
|
||||
long[] rawChunkData = readLongs(nChunks*2);
|
||||
for (int ci = 0; ci < nChunks; ci++) {
|
||||
final long chunkBegin = rawChunkData[ci*2];
|
||||
final long chunkEnd = rawChunkData[ci*2+1];
|
||||
|
|
@ -120,18 +142,14 @@ public class GATKBAMIndex {
|
|||
bins.set(indexBin,bin);
|
||||
}
|
||||
|
||||
final int nLinearBins = readInteger(fileChannel);
|
||||
long[] linearIndexEntries = readLongs(fileChannel,nLinearBins);
|
||||
final int nLinearBins = readInteger();
|
||||
long[] linearIndexEntries = readLongs(nLinearBins);
|
||||
|
||||
linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries);
|
||||
LinearIndex linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries);
|
||||
|
||||
try {
|
||||
fileChannel.close();
|
||||
fileStream.close();
|
||||
}
|
||||
catch (IOException exc) {
|
||||
throw new RuntimeIOException(exc.getMessage(), exc);
|
||||
}
|
||||
closeIndexFile();
|
||||
|
||||
return new GATKBAMIndexData(this,referenceSequence,bins,linearIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -203,72 +221,6 @@ public class GATKBAMIndex {
|
|||
return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize);
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform an overlapping query of all bins bounding the given location.
|
||||
* @param bin The bin over which to perform an overlapping query.
|
||||
* @return The file pointers
|
||||
*/
|
||||
public GATKBAMFileSpan getSpanOverlapping(final Bin bin) {
|
||||
if(bin == null)
|
||||
return null;
|
||||
|
||||
GATKBin gatkBin = new GATKBin(bin);
|
||||
|
||||
final int binLevel = getLevelForBin(bin);
|
||||
final int firstLocusInBin = getFirstLocusInBin(bin);
|
||||
|
||||
// Add the specified bin to the tree if it exists.
|
||||
List<GATKBin> binTree = new ArrayList<GATKBin>();
|
||||
if(gatkBin.getBinNumber() < bins.size() && bins.get(gatkBin.getBinNumber()) != null)
|
||||
binTree.add(bins.get(gatkBin.getBinNumber()));
|
||||
|
||||
int currentBinLevel = binLevel;
|
||||
while(--currentBinLevel >= 0) {
|
||||
final int binStart = getFirstBinInLevel(currentBinLevel);
|
||||
final int binWidth = getMaxAddressibleGenomicLocation()/getLevelSize(currentBinLevel);
|
||||
final int binNumber = firstLocusInBin/binWidth + binStart;
|
||||
if(binNumber < bins.size() && bins.get(binNumber) != null)
|
||||
binTree.add(bins.get(binNumber));
|
||||
}
|
||||
|
||||
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
|
||||
for(GATKBin coveringBin: binTree) {
|
||||
for(GATKChunk chunk: coveringBin.getChunkList())
|
||||
chunkList.add(chunk.clone());
|
||||
}
|
||||
|
||||
final int start = getFirstLocusInBin(bin);
|
||||
chunkList = optimizeChunkList(chunkList,linearIndex.getMinimumOffset(start));
|
||||
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
|
||||
}
|
||||
|
||||
protected List<GATKChunk> optimizeChunkList(final List<GATKChunk> chunks, final long minimumOffset) {
|
||||
GATKChunk lastChunk = null;
|
||||
Collections.sort(chunks);
|
||||
final List<GATKChunk> result = new ArrayList<GATKChunk>();
|
||||
for (final GATKChunk chunk : chunks) {
|
||||
if (chunk.getChunkEnd() <= minimumOffset) {
|
||||
continue; // linear index optimization
|
||||
}
|
||||
if (result.isEmpty()) {
|
||||
result.add(chunk);
|
||||
lastChunk = chunk;
|
||||
continue;
|
||||
}
|
||||
// Coalesce chunks that are in adjacent file blocks.
|
||||
// This is a performance optimization.
|
||||
if (!lastChunk.overlaps(chunk) && !lastChunk.isAdjacentTo(chunk)) {
|
||||
result.add(chunk);
|
||||
lastChunk = chunk;
|
||||
} else {
|
||||
if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) {
|
||||
lastChunk.setChunkEnd(chunk.getChunkEnd());
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the possible number of bins for a given reference sequence.
|
||||
* @return How many bins could possibly be used according to this indexing scheme to index a single contig.
|
||||
|
|
@ -277,51 +229,84 @@ public class GATKBAMIndex {
|
|||
return BIN_GENOMIC_SPAN;
|
||||
}
|
||||
|
||||
protected void skipToSequence(final FileChannel fileChannel, final int sequenceIndex) {
|
||||
for (int i = 0; i < sequenceIndex; i++) {
|
||||
protected void skipToSequence(final int referenceSequence) {
|
||||
// Find the offset in the file of the last sequence whose position has been determined. Start here
|
||||
// when searching the sequence for the next value to read. (Note that sequenceStartCache[0] will always
|
||||
// be present, so no extra stopping condition is necessary.
|
||||
int sequenceIndex = referenceSequence;
|
||||
while(sequenceStartCache[sequenceIndex] == -1)
|
||||
sequenceIndex--;
|
||||
|
||||
// Advance to the most recently found position.
|
||||
seek(sequenceStartCache[sequenceIndex]);
|
||||
|
||||
for (int i = sequenceIndex; i < referenceSequence; i++) {
|
||||
sequenceStartCache[i] = position();
|
||||
|
||||
// System.out.println("# Sequence TID: " + i);
|
||||
final int nBins = readInteger(fileChannel);
|
||||
final int nBins = readInteger();
|
||||
// System.out.println("# nBins: " + nBins);
|
||||
for (int j = 0; j < nBins; j++) {
|
||||
final int bin = readInteger(fileChannel);
|
||||
final int nChunks = readInteger(fileChannel);
|
||||
final int bin = readInteger();
|
||||
final int nChunks = readInteger();
|
||||
// System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks);
|
||||
skipBytes(fileChannel, 16 * nChunks);
|
||||
skipBytes(16 * nChunks);
|
||||
}
|
||||
final int nLinearBins = readInteger(fileChannel);
|
||||
final int nLinearBins = readInteger();
|
||||
// System.out.println("# nLinearBins: " + nLinearBins);
|
||||
skipBytes(fileChannel, 8 * nLinearBins);
|
||||
skipBytes(8 * nLinearBins);
|
||||
}
|
||||
|
||||
sequenceStartCache[referenceSequence] = position();
|
||||
}
|
||||
|
||||
private void openIndexFile() {
|
||||
try {
|
||||
fileStream = new FileInputStream(mFile);
|
||||
fileChannel = fileStream.getChannel();
|
||||
}
|
||||
catch (IOException exc) {
|
||||
throw new ReviewedStingException("Unable to open index file " + mFile, exc);
|
||||
}
|
||||
}
|
||||
|
||||
private void closeIndexFile() {
|
||||
try {
|
||||
fileChannel.close();
|
||||
fileStream.close();
|
||||
}
|
||||
catch (IOException exc) {
|
||||
throw new ReviewedStingException("Unable to close index file " + mFile, exc);
|
||||
}
|
||||
}
|
||||
|
||||
private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8;
|
||||
private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8;
|
||||
|
||||
private byte[] readBytes(final FileChannel fileChannel, int count) {
|
||||
private byte[] readBytes(int count) {
|
||||
ByteBuffer buffer = getBuffer(count);
|
||||
read(fileChannel,buffer);
|
||||
read(buffer);
|
||||
buffer.flip();
|
||||
byte[] contents = new byte[count];
|
||||
buffer.get(contents);
|
||||
return contents;
|
||||
}
|
||||
|
||||
private int readInteger(final FileChannel fileChannel) {
|
||||
private int readInteger() {
|
||||
ByteBuffer buffer = getBuffer(INT_SIZE_IN_BYTES);
|
||||
read(fileChannel,buffer);
|
||||
read(buffer);
|
||||
buffer.flip();
|
||||
return buffer.getInt();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads an array of <count> longs from the file channel, returning the results as an array.
|
||||
* @param fileChannel The file backing the schedule.
|
||||
* @param count Number of longs to read.
|
||||
* @return An array of longs. Size of array should match count.
|
||||
*/
|
||||
private long[] readLongs(final FileChannel fileChannel, final int count) {
|
||||
private long[] readLongs(final int count) {
|
||||
ByteBuffer buffer = getBuffer(count*LONG_SIZE_IN_BYTES);
|
||||
read(fileChannel,buffer);
|
||||
read(buffer);
|
||||
buffer.flip();
|
||||
long[] result = new long[count];
|
||||
for(int i = 0; i < count; i++)
|
||||
|
|
@ -329,7 +314,7 @@ public class GATKBAMIndex {
|
|||
return result;
|
||||
}
|
||||
|
||||
private void read(final FileChannel fileChannel, final ByteBuffer buffer) {
|
||||
private void read(final ByteBuffer buffer) {
|
||||
try {
|
||||
fileChannel.read(buffer);
|
||||
}
|
||||
|
|
@ -356,7 +341,7 @@ public class GATKBAMIndex {
|
|||
return buffer;
|
||||
}
|
||||
|
||||
private void skipBytes(final FileChannel fileChannel, final int count) {
|
||||
private void skipBytes(final int count) {
|
||||
try {
|
||||
fileChannel.position(fileChannel.position() + count);
|
||||
}
|
||||
|
|
@ -365,7 +350,7 @@ public class GATKBAMIndex {
|
|||
}
|
||||
}
|
||||
|
||||
private void seek(final FileChannel fileChannel, final int position) {
|
||||
private void seek(final long position) {
|
||||
try {
|
||||
fileChannel.position(position);
|
||||
}
|
||||
|
|
@ -373,4 +358,17 @@ public class GATKBAMIndex {
|
|||
throw new ReviewedStingException("Index: unable to reposition of file channel.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the position from the current file channel.
|
||||
* @return position of the current file channel.
|
||||
*/
|
||||
private long position() {
|
||||
try {
|
||||
return fileChannel.position();
|
||||
}
|
||||
catch (IOException exc) {
|
||||
throw new ReviewedStingException("Unable to read position from index file " + mFile, exc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* Copyright (c) 2011, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.Bin;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKBin;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.LinearIndex;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Stores and processes a single reference worth of GATK data.
|
||||
*/
|
||||
public class GATKBAMIndexData {
|
||||
private final GATKBAMIndex index;
|
||||
private final int referenceSequence;
|
||||
private final List<GATKBin> bins;
|
||||
private final LinearIndex linearIndex;
|
||||
|
||||
public GATKBAMIndexData(final GATKBAMIndex index, final int referenceSequence, final List<GATKBin> bins, final LinearIndex linearIndex) {
|
||||
this.index = index;
|
||||
this.referenceSequence = referenceSequence;
|
||||
this.bins = bins;
|
||||
this.linearIndex = linearIndex;
|
||||
}
|
||||
|
||||
public int getReferenceSequence() {
|
||||
return referenceSequence;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform an overlapping query of all bins bounding the given location.
|
||||
* @param bin The bin over which to perform an overlapping query.
|
||||
* @return The file pointers
|
||||
*/
|
||||
public GATKBAMFileSpan getSpanOverlapping(final Bin bin) {
|
||||
if(bin == null)
|
||||
return null;
|
||||
|
||||
GATKBin gatkBin = new GATKBin(bin);
|
||||
|
||||
final int binLevel = index.getLevelForBin(bin);
|
||||
final int firstLocusInBin = index.getFirstLocusInBin(bin);
|
||||
|
||||
// Add the specified bin to the tree if it exists.
|
||||
List<GATKBin> binTree = new ArrayList<GATKBin>();
|
||||
if(gatkBin.getBinNumber() < bins.size() && bins.get(gatkBin.getBinNumber()) != null)
|
||||
binTree.add(bins.get(gatkBin.getBinNumber()));
|
||||
|
||||
int currentBinLevel = binLevel;
|
||||
while(--currentBinLevel >= 0) {
|
||||
final int binStart = index.getFirstBinInLevel(currentBinLevel);
|
||||
final int binWidth = index.getMaxAddressibleGenomicLocation()/index.getLevelSize(currentBinLevel);
|
||||
final int binNumber = firstLocusInBin/binWidth + binStart;
|
||||
if(binNumber < bins.size() && bins.get(binNumber) != null)
|
||||
binTree.add(bins.get(binNumber));
|
||||
}
|
||||
|
||||
List<GATKChunk> chunkList = new ArrayList<GATKChunk>();
|
||||
for(GATKBin coveringBin: binTree) {
|
||||
for(GATKChunk chunk: coveringBin.getChunkList())
|
||||
chunkList.add(chunk.clone());
|
||||
}
|
||||
|
||||
final int start = index.getFirstLocusInBin(bin);
|
||||
chunkList = optimizeChunkList(chunkList,linearIndex.getMinimumOffset(start));
|
||||
return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()]));
|
||||
}
|
||||
|
||||
private List<GATKChunk> optimizeChunkList(final List<GATKChunk> chunks, final long minimumOffset) {
|
||||
GATKChunk lastChunk = null;
|
||||
Collections.sort(chunks);
|
||||
final List<GATKChunk> result = new ArrayList<GATKChunk>();
|
||||
for (final GATKChunk chunk : chunks) {
|
||||
if (chunk.getChunkEnd() <= minimumOffset) {
|
||||
continue; // linear index optimization
|
||||
}
|
||||
if (result.isEmpty()) {
|
||||
result.add(chunk);
|
||||
lastChunk = chunk;
|
||||
continue;
|
||||
}
|
||||
// Coalesce chunks that are in adjacent file blocks.
|
||||
// This is a performance optimization.
|
||||
if (!lastChunk.overlaps(chunk) && !lastChunk.isAdjacentTo(chunk)) {
|
||||
result.add(chunk);
|
||||
lastChunk = chunk;
|
||||
} else {
|
||||
if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) {
|
||||
lastChunk.setChunkEnd(chunk.getChunkEnd());
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -85,7 +85,7 @@ public class SAMDataSource {
|
|||
/**
|
||||
* Store BAM indices for each reader present.
|
||||
*/
|
||||
private final Map<SAMReaderID,Object> bamIndices = new HashMap<SAMReaderID,Object>();
|
||||
private final Map<SAMReaderID,GATKBAMIndex> bamIndices = new HashMap<SAMReaderID,GATKBAMIndex>();
|
||||
|
||||
/**
|
||||
* How far along is each reader?
|
||||
|
|
@ -292,9 +292,8 @@ public class SAMDataSource {
|
|||
if(enableLowMemorySharding) {
|
||||
for(SAMReaderID id: readerIDs) {
|
||||
File indexFile = findIndexFile(id.samFile);
|
||||
if(indexFile != null) {
|
||||
bamIndices.put(id,indexFile);
|
||||
}
|
||||
if(indexFile != null)
|
||||
bamIndices.put(id,new GATKBAMIndex(indexFile));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -424,6 +423,9 @@ public class SAMDataSource {
|
|||
|
||||
/**
|
||||
* Gets the index for a particular reader. Always preloaded.
|
||||
* TODO: Should return object of type GATKBAMIndex, but cannot because there
|
||||
* TODO: is no parent class of both BAMIndex and GATKBAMIndex. Change when new
|
||||
* TODO: sharding system goes live.
|
||||
* @param id Id of the reader.
|
||||
* @return The index. Will preload the index if necessary.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ import net.sf.samtools.SAMSequenceRecord;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.FilePointer;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.LowMemoryIntervalSharder;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
|
||||
|
|
@ -42,6 +43,7 @@ import org.broadinstitute.sting.utils.text.ListFileUtils;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.math.BigInteger;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
|
@ -61,6 +63,9 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
@Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.",required=false)
|
||||
public List<String> intervals = null;
|
||||
|
||||
@Output(required=false)
|
||||
public PrintStream out = System.out;
|
||||
|
||||
/**
|
||||
* The square of the sum of all uncompressed data. Based on the BAM spec, the size of this could be
|
||||
* up to (2^64)^2.
|
||||
|
|
@ -99,7 +104,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
intervalSortedSet.add(genomeLocParser.createGenomeLoc(entry.getSequenceName(),1,entry.getSequenceLength()));
|
||||
}
|
||||
|
||||
logger.info(String.format("Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
logger.info(String.format("PROGRESS: Calculating mean and variance: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
|
||||
LowMemoryIntervalSharder sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
||||
while(sharder.hasNext()) {
|
||||
|
|
@ -115,7 +120,7 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
|
||||
if(numberOfShards % 1000 == 0) {
|
||||
GenomeLoc boundingRegion = getBoundingRegion(filePointer,genomeLocParser);
|
||||
logger.info(String.format("Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
|
||||
logger.info(String.format("PROGRESS: Calculating mean and variance: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -127,7 +132,9 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
|
||||
// Crank through the shards again, this time reporting on the shards significantly larger than the mean.
|
||||
long threshold = mean + stddev*5;
|
||||
logger.warn(String.format("Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
logger.warn(String.format("PROGRESS: Searching for large shards: Contig\tRegion.Start\tRegion.Stop\tSize"));
|
||||
out.printf("Contig\tRegion.Start\tRegion.Stop\tSize%n");
|
||||
|
||||
sharder = new LowMemoryIntervalSharder(dataSource,intervalSortedSet);
|
||||
while(sharder.hasNext()) {
|
||||
FilePointer filePointer = sharder.next();
|
||||
|
|
@ -142,11 +149,11 @@ public class FindLargeShards extends CommandLineProgram {
|
|||
|
||||
if(filePointer.size() <= threshold) {
|
||||
if(numberOfShards % 1000 == 0)
|
||||
logger.info(String.format("%s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
|
||||
logger.info(String.format("PROGRESS: Searching for large shards: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
|
||||
continue;
|
||||
}
|
||||
|
||||
logger.warn(String.format("FOUND LARGE SHARD: %s\t%d\t%d\t%d",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size));
|
||||
out.printf("%s\t%d\t%d\t%d%n",boundingRegion.getContig(),boundingRegion.getStart(),boundingRegion.getStop(),size);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -57,7 +57,7 @@ public class GATKBAMIndexUnitTest extends BaseTest {
|
|||
SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary();
|
||||
reader.close();
|
||||
|
||||
bamIndex = new GATKBAMIndex(bamIndexFile,0);
|
||||
bamIndex = new GATKBAMIndex(bamIndexFile);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
|||
Loading…
Reference in New Issue