From 22a11e41e16b826a70ad7bfda7a5a5f5552d4ff7 Mon Sep 17 00:00:00 2001 From: hanna Date: Tue, 12 Apr 2011 23:49:58 +0000 Subject: [PATCH] Rewrite of GATKBAMIndex to avoid mmaps causing false reports of heavy memory usage. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5620 348d0f76-0448-11de-a6fe-93d51630548a --- .../reads/BAMIndexBinIterator.java | 328 ------------ .../gatk/datasources/reads/BAMSchedule.java | 11 +- .../gatk/datasources/reads/BAMScheduler.java | 15 +- .../gatk/datasources/reads/GATKBAMIndex.java | 473 ++++-------------- .../datasources/reads/IntervalSharder.java | 2 +- .../reads/LowMemoryIntervalSharder.java | 1 + .../gatk/datasources/reads/SAMDataSource.java | 8 +- .../reads/GATKBAMIndexUnitTest.java | 2 +- 8 files changed, 127 insertions(+), 713 deletions(-) delete mode 100644 java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java deleted file mode 100644 index ef6f5f65e..000000000 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMIndexBinIterator.java +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Copyright (c) 2011, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.GATKBin; -import net.sf.samtools.GATKChunk; -import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.nio.channels.FileChannel; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; - -/** - * Creates an 'index of the index' for a particular reference sequence - * within the BAM file, for easier whole-BAM-file traversal. - * file. - */ -public class BAMIndexBinIterator { - /** - * The source of index data. - */ - private final GATKBAMIndex index; - - /** - * The file storing the index data. - */ - private final File indexFile; - - /** - * File storing index metadata. - */ - private final File metaIndexFile; - - /** - * Reference sequence that temporary file is based on. - */ - private final int referenceSequence; - - /** - * Position of the linear index in the BAM on disk for the given reference sequence. - */ - private final long linearIndexPosition; - - /** - * Size of the linear index for the given reference sequence. - */ - private final int linearIndexSize; - - /** - * Size of a long in bytes. - */ - private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; - - public BAMIndexBinIterator(final GATKBAMIndex index, final File indexFile, final int referenceSequence) { - this.index = index; - this.indexFile = indexFile; - this.referenceSequence = referenceSequence; - - index.seek(4); - - final int sequenceCount = index.readInteger(); - - if (referenceSequence >= sequenceCount) - throw new ReviewedStingException(String.format("Reference sequence past end of genome; reference sequence = %d, sequence count = %d",referenceSequence,sequenceCount)); - - index.skipToSequence(referenceSequence); - - int binCount = index.readInteger(); - - try { - metaIndexFile = File.createTempFile("bammetaindex."+referenceSequence,null); - metaIndexFile.deleteOnExit(); - - FileOutputStream metaIndex = new FileOutputStream(metaIndexFile); - FileChannel metaIndexChannel = metaIndex.getChannel(); - - // zero out the contents of the file. Arrays of primitives in java are always zeroed out by default. - byte[] emptyContents = new byte[GATKBAMIndex.MAX_BINS*LONG_SIZE_IN_BYTES]; // byte array is zeroed out by default. - metaIndexChannel.write(ByteBuffer.wrap(emptyContents)); - - ByteBuffer positionBuffer = ByteBuffer.allocate(emptyContents.length); - positionBuffer.order(ByteOrder.LITTLE_ENDIAN); - - // Write the positions of the bins in the index - for (int binNumber = 0; binNumber < binCount; binNumber++) { - long position = index.position(); - - final int indexBin = index.readInteger(); - - metaIndexChannel.position(indexBin*LONG_SIZE_IN_BYTES); - positionBuffer.putLong(position); - positionBuffer.flip(); - - metaIndexChannel.write(positionBuffer); - positionBuffer.flip(); - - final int nChunks = index.readInteger(); - index.skipBytes(16 * nChunks); - } - - // Read the size of the linear index and the first position. - linearIndexSize = index.readInteger(); - linearIndexPosition = index.position(); - - metaIndexChannel.close(); - metaIndex.close(); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to write BAM metaindex",ex); - } - } - - public void close() { - metaIndexFile.delete(); - } - - public CloseableIterator getIteratorOverLevel(int level) { - return new LevelIterator(level); - } - - /** - * Gets the linear index entry for the specified genomic start. - * @param genomicStart Starting location for which to search. - * @return Linear index entry corresponding to this genomic start. - */ - public long getLinearIndexEntry(final int genomicStart) { - final int indexOffset = (genomicStart-1 >> 14); - // If the number of bins is exceeded, don't perform any trimming. - if(indexOffset >= linearIndexSize) - return 0L; - - FileInputStream indexInputStream; - try { - indexInputStream = new FileInputStream(indexFile); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to open index file for reading"); - } - - try { - indexInputStream.getChannel().position(linearIndexPosition+(indexOffset*LONG_SIZE_IN_BYTES)); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to position index file for reading"); - } - - ByteBuffer indexReader = ByteBuffer.allocate(LONG_SIZE_IN_BYTES); - indexReader.order(ByteOrder.LITTLE_ENDIAN); - - try { - indexInputStream.getChannel().read(indexReader); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to position index file for reading"); - } - - indexReader.flip(); - final long linearIndexEntry = indexReader.getLong(); - - try { - indexInputStream.close(); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to close index file."); - } - - return linearIndexEntry; - } - - private class LevelIterator implements CloseableIterator { - /** - * The raw BAM index file with unordered bins. - */ - private final FileInputStream indexInputStream; - - /** - * The index metafile, with pointers to ordered bins. - */ - private final FileInputStream metaIndexInputStream; - - /** - * The first and last bins in the level. - */ - private final int firstBinNumber, lastBinNumber; - - /** - * The current bin in the index. - */ - private int currentBinNumber; - - /** - * Position of the most recent chunk data. - */ - private GATKBin nextBin = null; - - public LevelIterator(final int level) { - try { - indexInputStream = new FileInputStream(indexFile); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to open index file for reading"); - } - try { - metaIndexInputStream = new FileInputStream(metaIndexFile); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to open index metafile for reading"); - } - - firstBinNumber = GATKBAMIndex.getFirstBinInLevel(level); - lastBinNumber = firstBinNumber + index.getLevelSize(level) - 1; - - currentBinNumber = firstBinNumber - 1; - advance(); - } - - public void close() { - try { - indexInputStream.close(); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to close index file."); - } - try { - metaIndexInputStream.close(); - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to close index metafile"); - } - } - - public boolean hasNext() { - return nextBin != null; - } - - public GATKBin next() { - if(!hasNext()) - throw new NoSuchElementException("Out of elements in BAMIndexBinIterator"); - GATKBin currentPosition = nextBin; - advance(); - return currentPosition; - } - - public void remove() { throw new UnsupportedOperationException("Cannot remove from a LevelIterator"); } - - private void advance() { - ByteBuffer indexFilePositionDecoder = ByteBuffer.allocate(LONG_SIZE_IN_BYTES*2); - indexFilePositionDecoder.order(ByteOrder.LITTLE_ENDIAN); - - nextBin = null; - try { - indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES); - while(nextBin == null && currentBinNumber < lastBinNumber) { - currentBinNumber++; - metaIndexInputStream.getChannel().position(currentBinNumber*LONG_SIZE_IN_BYTES); - metaIndexInputStream.getChannel().read(indexFilePositionDecoder); - indexFilePositionDecoder.flip(); - long currentPosition = indexFilePositionDecoder.getLong(); - indexFilePositionDecoder.flip(); - - if(currentPosition != 0) { - indexInputStream.getChannel().position(currentPosition); - indexInputStream.getChannel().read(indexFilePositionDecoder); - - indexFilePositionDecoder.flip(); - int binNumber = indexFilePositionDecoder.getInt(); - if(binNumber != currentBinNumber) - throw new ReviewedStingException("Index file and metaindex file are out of sync."); - - int nChunks = indexFilePositionDecoder.getInt(); - GATKChunk[] chunks = new GATKChunk[nChunks]; - - indexFilePositionDecoder.limit(LONG_SIZE_IN_BYTES*2); - indexFilePositionDecoder.clear(); - - for (int ci = 0; ci < nChunks; ci++) { - indexInputStream.getChannel().read(indexFilePositionDecoder); - - indexFilePositionDecoder.flip(); - final long chunkBegin = indexFilePositionDecoder.getLong(); - final long chunkEnd = indexFilePositionDecoder.getLong(); - chunks[ci] = new GATKChunk(chunkBegin, chunkEnd); - - indexFilePositionDecoder.flip(); - } - - nextBin = new GATKBin(referenceSequence,binNumber); - nextBin.setChunkList(chunks); - } - } - } - catch(IOException ex) { - throw new ReviewedStingException("Unable to close index metafile"); - } - - } - } -} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java index b86187405..3aa88f147 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMSchedule.java @@ -88,10 +88,10 @@ public class BAMSchedule implements CloseableIterator { /** * Create a new BAM schedule based on the given index. - * @param indices Mapping from readers to indices. + * @param indexFiles Index files. * @param intervals List of */ - public BAMSchedule(final Map indices, final List intervals) { + public BAMSchedule(final Map indexFiles, final List intervals) { if(intervals.isEmpty()) throw new ReviewedStingException("Tried to write schedule for empty interval list."); @@ -99,11 +99,10 @@ public class BAMSchedule implements CloseableIterator { createScheduleFile(); - readerIDs.addAll(indices.keySet()); + readerIDs.addAll(indexFiles.keySet()); for(final SAMReaderID reader: readerIDs) { - GATKBAMIndex index = indices.get(reader); - BAMIndexContent indexContent = index.getQueryResults(referenceSequence); + GATKBAMIndex index = new GATKBAMIndex(indexFiles.get(reader),referenceSequence); int currentBinInLowestLevel = GATKBAMIndex.getFirstBinInLevel(GATKBAMIndex.getNumIndexLevels()-1); Iterator locusIterator = intervals.iterator(); @@ -133,7 +132,7 @@ public class BAMSchedule implements CloseableIterator { // Code at this point knows that the current bin is neither before nor after the current locus, // so it must overlap. Add this region to the filesystem. - final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(indexContent,bin); + final GATKBAMFileSpan fileSpan = index.getSpanOverlapping(bin); if(!fileSpan.isEmpty()) { // File format is binary in little endian; start of region, end of region, num chunks, then the chunks themselves. diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java index c94609d54..b3cdbc31d 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java @@ -30,6 +30,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import java.io.File; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -46,7 +47,7 @@ import java.util.NoSuchElementException; public class BAMScheduler implements Iterator { private final SAMDataSource dataSource; - private final Map indices = new HashMap(); + private final Map indexFiles = new HashMap(); private FilePointer nextFilePointer = null; @@ -59,7 +60,7 @@ public class BAMScheduler implements Iterator { public BAMScheduler(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { this.dataSource = dataSource; for(SAMReaderID reader: dataSource.getReaderIDs()) - indices.put(reader,(GATKBAMIndex)dataSource.getIndex(reader)); + indexFiles.put(reader,(File)dataSource.getIndex(reader)); this.loci = loci; locusIterator = new PeekableIterator(loci.iterator()); if(locusIterator.hasNext()) @@ -104,7 +105,7 @@ public class BAMScheduler implements Iterator { int coveredRegionStop = Integer.MAX_VALUE; GenomeLoc coveredRegion = null; - BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indices,currentLocus); + BAMScheduleEntry scheduleEntry = getNextOverlappingBAMScheduleEntry(indexFiles,currentLocus); // No overlapping data at all. if(scheduleEntry != null) { @@ -117,7 +118,7 @@ public class BAMScheduler implements Iterator { else { // Always create a file span, whether there was covered data or not. If there was no covered data, then the binTree is empty. //System.out.printf("Shard: index file = %s; reference sequence = %d; ",index.getIndexFile(),currentLocus.getContigIndex()); - for(SAMReaderID reader: indices.keySet()) + for(SAMReaderID reader: indexFiles.keySet()) nextFilePointer.addFileSpans(reader,new GATKBAMFileSpan()); } @@ -183,11 +184,11 @@ public class BAMScheduler implements Iterator { /** * Get the next overlapping tree of bins associated with the given BAM file. - * @param indices BAM index representation. + * @param indexFiles BAM index file. * @param currentLocus The actual locus for which to check overlap. * @return The next schedule entry overlapping with the given list of loci. */ - private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indices, final GenomeLoc currentLocus) { + private BAMScheduleEntry getNextOverlappingBAMScheduleEntry(final Map indexFiles, final GenomeLoc currentLocus) { // Stale reference sequence or first invocation. (Re)create the binTreeIterator. if(lastReferenceSequenceLoaded == null || lastReferenceSequenceLoaded != currentLocus.getContigIndex()) { if(bamScheduleIterator != null) @@ -201,7 +202,7 @@ public class BAMScheduler implements Iterator { lociInContig.add(locus); } - bamScheduleIterator = new PeekableIterator(new BAMSchedule(indices,lociInContig)); + bamScheduleIterator = new PeekableIterator(new BAMSchedule(indexFiles,lociInContig)); } if(!bamScheduleIterator.hasNext()) diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java index 458fb4ede..ad758719e 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndex.java @@ -23,24 +23,20 @@ */ package org.broadinstitute.sting.gatk.datasources.reads; -import net.sf.samtools.BAMIndex; -import net.sf.samtools.BAMIndexMetaData; import net.sf.samtools.Bin; -import net.sf.samtools.BrowseableBAMIndex; import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKBin; -import net.sf.samtools.GATKBinList; import net.sf.samtools.GATKChunk; import net.sf.samtools.LinearIndex; import net.sf.samtools.SAMException; -import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.util.RuntimeIOException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.ByteOrder; -import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.util.*; @@ -50,7 +46,7 @@ import java.util.*; * @author mhanna * @version 0.1 */ -public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { +public class GATKBAMIndex { /** * Reports the total amount of genomic data that any bin can index. */ @@ -66,46 +62,75 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { */ public static final int MAX_BINS = 37450; // =(8^6-1)/7+1 - public static final int MAX_LINEAR_INDEX_SIZE = MAX_BINS+1-LEVEL_STARTS[LEVEL_STARTS.length-1]; - private final File mFile; - private final MappedByteBuffer mFileBuffer; - private SAMSequenceDictionary mBamDictionary = null; + private final List bins = new ArrayList(); + private final LinearIndex linearIndex; - public GATKBAMIndex(final File file, final SAMSequenceDictionary dictionary) { + public GATKBAMIndex(final File file, final int referenceSequence) { mFile = file; - mBamDictionary = dictionary; // Open the file stream. - try { - FileInputStream fileStream = new FileInputStream(mFile); - FileChannel fileChannel = fileStream.getChannel(); - mFileBuffer = fileChannel.map(FileChannel.MapMode.READ_ONLY, 0L, fileChannel.size()); - mFileBuffer.order(ByteOrder.LITTLE_ENDIAN); - fileChannel.close(); - fileStream.close(); + FileInputStream fileStream = null; + FileChannel fileChannel = null; + + try { + fileStream = new FileInputStream(mFile); + fileChannel = fileStream.getChannel(); } catch (IOException exc) { throw new RuntimeIOException(exc.getMessage(), exc); } // Verify the magic number. - seek(0); + seek(fileChannel,0); final byte[] buffer = new byte[4]; - readBytes(buffer); + readBytes(fileChannel,buffer); if (!Arrays.equals(buffer, GATKBAMFileConstants.BAM_INDEX_MAGIC)) { throw new RuntimeException("Invalid file header in BAM index " + mFile + ": " + new String(buffer)); } - } - /** - * Gets the file backing this index. - * @return The index file. - */ - public File getIndexFile() { - return mFile; + seek(fileChannel,4); + + final int sequenceCount = readInteger(fileChannel); + + if (referenceSequence >= sequenceCount) + throw new ReviewedStingException("Invalid sequence number " + referenceSequence); + + skipToSequence(fileChannel,referenceSequence); + + int binCount = readInteger(fileChannel); + for (int binNumber = 0; binNumber < binCount; binNumber++) { + final int indexBin = readInteger(fileChannel); + final int nChunks = readInteger(fileChannel); + List chunks = new ArrayList(nChunks); + for (int ci = 0; ci < nChunks; ci++) { + final long chunkBegin = readLong(fileChannel); + final long chunkEnd = readLong(fileChannel); + chunks.add(new GATKChunk(chunkBegin, chunkEnd)); + } + GATKBin bin = new GATKBin(referenceSequence, indexBin); + bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); + while(indexBin >= bins.size()) + bins.add(null); + bins.set(indexBin,bin); + } + + final int nLinearBins = readInteger(fileChannel); + long[] linearIndexEntries = new long[nLinearBins]; + for(int linearIndexOffset = 0; linearIndexOffset < nLinearBins; linearIndexOffset++) + linearIndexEntries[linearIndexOffset] = readLong(fileChannel); + + linearIndex = new LinearIndex(referenceSequence,0,linearIndexEntries); + + try { + fileChannel.close(); + fileStream.close(); + } + catch (IOException exc) { + throw new RuntimeIOException(exc.getMessage(), exc); + } } /** @@ -142,7 +167,6 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { * @param bin The bin for which to determine the level. * @return the level associated with the given bin number. */ - @Override public int getLevelForBin(final Bin bin) { GATKBin gatkBin = new GATKBin(bin); if(gatkBin.getBinNumber() >= MAX_BINS) @@ -171,7 +195,6 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { * @param bin The bin to test. * @return The last position that the given bin can represent. */ - @Override public int getLastLocusInBin(final Bin bin) { final int level = getLevelForBin(bin); final int levelStart = LEVEL_STARTS[level]; @@ -179,164 +202,32 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { return (new GATKBin(bin).getBinNumber()-levelStart+1)*(BIN_GENOMIC_SPAN /levelSize); } - public int getNumberOfReferences() { - seek(4); - return readInteger(); - } - - /** - * Use to get close to the unmapped reads at the end of a BAM file. - * @return The file offset of the first record in the last linear bin, or -1 - * if there are no elements in linear bins (i.e. no mapped reads). - */ - public long getStartOfLastLinearBin() { - seek(4); - - final int sequenceCount = readInteger(); - // Because no reads may align to the last sequence in the sequence dictionary, - // grab the last element of the linear index for each sequence, and return - // the last one from the last sequence that has one. - long lastLinearIndexPointer = -1; - for (int i = 0; i < sequenceCount; i++) { - // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j1 = 0; j1 < nBins; j1++) { - // Skip bin # - skipBytes(4); - final int nChunks = readInteger(); - // Skip chunks - skipBytes(16 * nChunks); - } - final int nLinearBins = readInteger(); - if (nLinearBins > 0) { - // Skip to last element of list of linear bins - skipBytes(8 * (nLinearBins - 1)); - lastLinearIndexPointer = readLong(); - } - } - - return lastLinearIndexPointer; - } - - /** - * Gets meta data for the given reference including information about number of aligned, unaligned, and noCoordinate records - * @param reference the reference of interest - * @return meta data for the reference - */ - public BAMIndexMetaData getMetaData(int reference) { - throw new UnsupportedOperationException("Cannot retrieve metadata for GATKBAMIndex"); - } - - /** - * Returns count of records unassociated with any reference. Call before the index file is closed - * - * @return meta data at the end of the bam index that indicates count of records holding no coordinates - * or null if no meta data (old index format) - */ - public Long getNoCoordinateCount() { - - seek(4); - final int sequenceCount = readInteger(); - - skipToSequence(sequenceCount); - try { // in case of old index file without meta data - return readLong(); - } catch (Exception e) { - return null; - } - } - - /** - * Get list of regions of BAM file that may contain SAMRecords for the given range - * @param referenceIndex sequence of desired SAMRecords - * @param startPos 1-based start of the desired interval, inclusive - * @param endPos 1-based end of the desired interval, inclusive - * @return the virtual file position. Each pair is the first and last virtual file position - * in a range that can be scanned to find SAMRecords that overlap the given positions. - */ - @Override - public GATKBAMFileSpan getSpanOverlapping(final int referenceIndex, final int startPos, final int endPos) { - BAMIndexContent queryResults = getQueryResults(referenceIndex); - - if(queryResults == null) - return null; - - GATKBinList overlappingBins = getBinsOverlapping(referenceIndex,startPos,endPos); - - // System.out.println("# Sequence target TID: " + referenceIndex); - List bins = new ArrayList(); - for(GATKBin bin: queryResults.getBins()) { - if (overlappingBins.getBins().get(bin.getBinNumber())) - bins.add(bin); - } - - if (bins.isEmpty()) { - return null; - } - - List chunkList = new ArrayList(); - for(GATKBin bin: bins) { - for(GATKChunk chunk: bin.getChunkList()) - chunkList.add(chunk.clone()); - } - - if (chunkList.isEmpty()) { - return null; - } - - chunkList = optimizeChunkList(chunkList,queryResults.getLinearIndex().getMinimumOffset(startPos)); - return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()])); - } - /** * Perform an overlapping query of all bins bounding the given location. * @param bin The bin over which to perform an overlapping query. * @return The file pointers */ - @Override public GATKBAMFileSpan getSpanOverlapping(final Bin bin) { if(bin == null) return null; GATKBin gatkBin = new GATKBin(bin); - final int referenceSequence = gatkBin.getReferenceSequence(); - BAMIndexContent indexQuery = getQueryResults(referenceSequence); - - return getSpanOverlapping(indexQuery,bin); - } - - /** - * Perform an overlapping query of all bins bounding the given location. - * @param bin The bin over which to perform an overlapping query. - * @return The file pointers - */ - public GATKBAMFileSpan getSpanOverlapping(final BAMIndexContent indexContent, final Bin bin) { - if(bin == null) - return null; - - GATKBin gatkBin = new GATKBin(bin); - - if(indexContent == null) - return null; - final int binLevel = getLevelForBin(bin); final int firstLocusInBin = getFirstLocusInBin(bin); // Add the specified bin to the tree if it exists. List binTree = new ArrayList(); - if(indexContent.containsBin(gatkBin)) - binTree.add(indexContent.getBins().getBin(gatkBin.getBinNumber())); + if(gatkBin.getBinNumber() < bins.size() && bins.get(gatkBin.getBinNumber()) != null) + binTree.add(bins.get(gatkBin.getBinNumber())); int currentBinLevel = binLevel; while(--currentBinLevel >= 0) { final int binStart = getFirstBinInLevel(currentBinLevel); final int binWidth = getMaxAddressibleGenomicLocation()/getLevelSize(currentBinLevel); final int binNumber = firstLocusInBin/binWidth + binStart; - GATKBin parentBin = indexContent.getBins().getBin(binNumber); - if(parentBin != null && indexContent.containsBin(parentBin)) - binTree.add(parentBin); + if(binNumber < bins.size() && bins.get(binNumber) != null) + binTree.add(bins.get(binNumber)); } List chunkList = new ArrayList(); @@ -346,173 +237,10 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { } final int start = getFirstLocusInBin(bin); - chunkList = optimizeChunkList(chunkList,indexContent.getLinearIndex().getMinimumOffset(start)); + chunkList = optimizeChunkList(chunkList,linearIndex.getMinimumOffset(start)); return new GATKBAMFileSpan(chunkList.toArray(new GATKChunk[chunkList.size()])); } - public GATKBAMFileSpan getContentsOfBin(final Bin bin) { - if(bin == null) - return null; - - GATKBin gatkBin = new GATKBin(bin); - - BAMIndexContent indexQuery = getQueryResults(gatkBin.getReferenceSequence()); - - if(indexQuery == null) - return null; - - GATKBin queriedBin = indexQuery.getBins().getBin(gatkBin.getBinNumber()); - - return queriedBin != null ? new GATKBAMFileSpan(queriedBin.getChunkList()) : null; - } - - /** - * Retrieves the linear index for the given reference sequence. - * @param referenceSequence Reference sequence number for which to retrieve the reference. - * @return The linear index for the given reference sequence. - */ - public LinearIndex getLinearIndex(int referenceSequence) { - return getQueryResults(referenceSequence).getLinearIndex(); - } - - /** - * Get a list of bins in the BAM file that may contain SAMRecords for the given range. - * @param referenceIndex sequence of desired SAMRecords - * @param startPos 1-based start of the desired interval, inclusive - * @param endPos 1-based end of the desired interval, inclusive - * @return a list of bins that contain relevant data. - */ - public GATKBinList getBinsOverlapping(final int referenceIndex, final int startPos, final int endPos) { - final BitSet regionBins = regionToBins(startPos,endPos); - if (regionBins == null) { - return null; - } - return new GATKBinList(referenceIndex,regionBins); - } - - protected BAMIndexContent query(final int referenceSequence, final int startPos, final int endPos) { - seek(4); - - List metaDataChunks = new ArrayList(); - - final int sequenceCount = readInteger(); - - if (referenceSequence >= sequenceCount) { - return null; - } - - final BitSet regionBins = regionToBins(startPos, endPos); - if (regionBins == null) { - return null; - } - - skipToSequence(referenceSequence); - - int binCount = readInteger(); - boolean metaDataSeen = false; - GATKBin[] bins = new GATKBin[getMaxBinNumberForReference(referenceSequence) +1]; - for (int binNumber = 0; binNumber < binCount; binNumber++) { - final int indexBin = readInteger(); - final int nChunks = readInteger(); - List chunks = new ArrayList(nChunks); - // System.out.println("# bin[" + i + "] = " + indexBin + ", nChunks = " + nChunks); - GATKChunk lastChunk = null; - if (regionBins.get(indexBin)) { - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = readLong(); - final long chunkEnd = readLong(); - lastChunk = new GATKChunk(chunkBegin, chunkEnd); - chunks.add(lastChunk); - } - } else if (indexBin == MAX_BINS) { - // meta data - build the bin so that the count of bins is correct; - // but don't attach meta chunks to the bin, or normal queries will be off - for (int ci = 0; ci < nChunks; ci++) { - final long chunkBegin = readLong(); - final long chunkEnd = readLong(); - lastChunk = new GATKChunk(chunkBegin, chunkEnd); - metaDataChunks.add(lastChunk); - } - metaDataSeen = true; - continue; // don't create a Bin - } else { - skipBytes(16 * nChunks); - } - GATKBin bin = new GATKBin(referenceSequence, indexBin); - bin.setChunkList(chunks.toArray(new GATKChunk[chunks.size()])); - bins[indexBin] = bin; - } - - final int nLinearBins = readInteger(); - - final int regionLinearBinStart = LinearIndex.convertToLinearIndexOffset(startPos); - final int regionLinearBinStop = endPos > 0 ? LinearIndex.convertToLinearIndexOffset(endPos) : nLinearBins-1; - final int actualStop = Math.min(regionLinearBinStop, nLinearBins -1); - - long[] linearIndexEntries = new long[0]; - if (regionLinearBinStart < nLinearBins) { - linearIndexEntries = new long[actualStop-regionLinearBinStart+1]; - skipBytes(8 * regionLinearBinStart); - for(int linearBin = regionLinearBinStart; linearBin <= actualStop; linearBin++) - linearIndexEntries[linearBin-regionLinearBinStart] = readLong(); - } - - final LinearIndex linearIndex = new LinearIndex(referenceSequence,regionLinearBinStart,linearIndexEntries); - - return new BAMIndexContent(referenceSequence, bins, binCount - (metaDataSeen? 1 : 0), linearIndex); - } - - /** - * The maxiumum bin number for a reference sequence of a given length - */ - static int getMaxBinNumberForSequenceLength(int sequenceLength) { - return getFirstBinInLevel(getNumIndexLevels() - 1) + (sequenceLength >> 14); - // return 4680 + (sequenceLength >> 14); // note 4680 = getFirstBinInLevel(getNumIndexLevels() - 1) - } - - /** - * Looks up the cached BAM query results if they're still in the cache and not expired. Otherwise, - * retrieves the cache results from disk. - * @param referenceIndex The reference to load. CachingBAMFileIndex only stores index data for entire references. - * @return The index information for this reference. - */ - public BAMIndexContent getQueryResults(final int referenceIndex) { - // If not in the cache, attempt to load it from disk. - return query(referenceIndex,1,-1); - } - - /** - * Gets the possible number of bins for a given reference sequence. - * @return How many bins could possibly be used according to this indexing scheme to index a single contig. - */ - protected int getMaxAddressibleGenomicLocation() { - return BIN_GENOMIC_SPAN; - } - - /** - * Get candidate bins for the specified region - * @param startPos 1-based start of target region, inclusive. - * @param endPos 1-based end of target region, inclusive. - * @return bit set for each bin that may contain SAMRecords in the target region. - */ - protected BitSet regionToBins(final int startPos, final int endPos) { - final int maxPos = 0x1FFFFFFF; - final int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos; - final int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos; - if (start > end) { - return null; - } - int k; - final BitSet bitSet = new BitSet(MAX_BINS); - bitSet.set(0); - for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); - for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); - for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); - for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); - for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); - return bitSet; - } - protected List optimizeChunkList(final List chunks, final long minimumOffset) { GATKChunk lastChunk = null; Collections.sort(chunks); @@ -541,61 +269,76 @@ public class GATKBAMIndex implements BAMIndex, BrowseableBAMIndex { } /** - * The maximum possible bin number for this reference sequence. - * This is based on the maximum coordinate position of the reference - * which is based on the size of the reference + * Gets the possible number of bins for a given reference sequence. + * @return How many bins could possibly be used according to this indexing scheme to index a single contig. */ - private int getMaxBinNumberForReference(final int reference) { - try { - final int sequenceLength = mBamDictionary.getSequence(reference).getSequenceLength(); - return getMaxBinNumberForSequenceLength(sequenceLength); - } catch (Exception e) { - return MAX_BINS; - } - } + protected int getMaxAddressibleGenomicLocation() { + return BIN_GENOMIC_SPAN; + } - protected void skipToSequence(final int sequenceIndex) { + protected void skipToSequence(final FileChannel fileChannel, final int sequenceIndex) { for (int i = 0; i < sequenceIndex; i++) { // System.out.println("# Sequence TID: " + i); - final int nBins = readInteger(); + final int nBins = readInteger(fileChannel); // System.out.println("# nBins: " + nBins); for (int j = 0; j < nBins; j++) { - final int bin = readInteger(); - final int nChunks = readInteger(); + final int bin = readInteger(fileChannel); + final int nChunks = readInteger(fileChannel); // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); - skipBytes(16 * nChunks); + skipBytes(fileChannel, 16 * nChunks); } - final int nLinearBins = readInteger(); + final int nLinearBins = readInteger(fileChannel); // System.out.println("# nLinearBins: " + nLinearBins); - skipBytes(8 * nLinearBins); + skipBytes(fileChannel, 8 * nLinearBins); } } - private void readBytes(final byte[] bytes) { - mFileBuffer.get(bytes); + private void readBytes(final FileChannel fileChannel, final byte[] bytes) { + ByteBuffer buffer = ByteBuffer.wrap(bytes); + try { + fileChannel.read(buffer); + } + catch(IOException ex) { + throw new ReviewedStingException("Index: unable to read bytes from index file."); + } } - protected int readInteger() { - return mFileBuffer.getInt(); + private static final int INT_SIZE_IN_BYTES = Integer.SIZE / 8; + private static final int LONG_SIZE_IN_BYTES = Long.SIZE / 8; + + private ByteBuffer wrapBuffer(final byte[] bytes) { + ByteBuffer buffer = ByteBuffer.wrap(bytes); + buffer.order(ByteOrder.LITTLE_ENDIAN); + return buffer; } - protected long readLong() { - return mFileBuffer.getLong(); + private int readInteger(final FileChannel fileChannel) { + byte[] bytes = new byte[INT_SIZE_IN_BYTES]; + readBytes(fileChannel,bytes); + return wrapBuffer(bytes).getInt(); } - protected void skipBytes(final int count) { - mFileBuffer.position(mFileBuffer.position() + count); + private long readLong(final FileChannel fileChannel) { + byte[] bytes = new byte[LONG_SIZE_IN_BYTES]; + readBytes(fileChannel,bytes); + return wrapBuffer(bytes).getLong(); } - protected void seek(final int position) { - mFileBuffer.position(position); + private void skipBytes(final FileChannel fileChannel, final int count) { + try { + fileChannel.position(fileChannel.position() + count); + } + catch(IOException ex) { + throw new ReviewedStingException("Index: unable to reposition file channel."); + } } - protected long position() { - return mFileBuffer.position(); - } - - @Override - public void close() { + private void seek(final FileChannel fileChannel, final int position) { + try { + fileChannel.position(position); + } + catch(IOException ex) { + throw new ReviewedStingException("Index: unable to reposition of file channel."); + } } } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java index 26166c753..fc3f76ab7 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java @@ -140,7 +140,7 @@ public class IntervalSharder { // If this contig can't be found in the reference, skip over it. if(referenceSequence == null && contig != null) continue; - final BrowseableBAMIndex index = dataSource.getIndex(id); + final BrowseableBAMIndex index = (BrowseableBAMIndex)dataSource.getIndex(id); binMerger.addReader(id, index, referenceSequence.getSequenceIndex(), diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index ba6321121..4d953dc16 100644 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -45,6 +45,7 @@ public class LowMemoryIntervalSharder implements Iterator { private final GenomeLocParser parser; public LowMemoryIntervalSharder(final SAMDataSource dataSource, final GenomeLocSortedSet loci) { + System.out.println("Low memory interval sharding enabled"); wrappedIterator = new PeekableIterator(new BAMScheduler(dataSource,loci)); parser = loci.getGenomeLocParser(); } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 3829636fc..ecfe15de7 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -84,7 +84,7 @@ public class SAMDataSource { /** * Store BAM indices for each reader present. */ - private final Map bamIndices = new HashMap(); + private final Map bamIndices = new HashMap(); /** * How far along is each reader? @@ -292,9 +292,7 @@ public class SAMDataSource { for(SAMReaderID id: readerIDs) { File indexFile = findIndexFile(id.samFile); if(indexFile != null) { - SAMSequenceDictionary sequenceDictionary = readers.getReader(id).getFileHeader().getSequenceDictionary(); - GATKBAMIndex index = new GATKBAMIndex(indexFile,sequenceDictionary); - bamIndices.put(id,index); + bamIndices.put(id,indexFile); } } } @@ -428,7 +426,7 @@ public class SAMDataSource { * @param id Id of the reader. * @return The index. Will preload the index if necessary. */ - public BrowseableBAMIndex getIndex(final SAMReaderID id) { + public Object getIndex(final SAMReaderID id) { if(enableLowMemorySharding) return bamIndices.get(id); else { diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java index fa21b9f38..55d6a908e 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKBAMIndexUnitTest.java @@ -57,7 +57,7 @@ public class GATKBAMIndexUnitTest extends BaseTest { SAMSequenceDictionary sequenceDictionary = reader.getFileHeader().getSequenceDictionary(); reader.close(); - bamIndex = new GATKBAMIndex(bamIndexFile,sequenceDictionary); + bamIndex = new GATKBAMIndex(bamIndexFile,0); } @Test