Efficiency improvement requested by the Picard team in IndexedFastaSequenceFile: improve the memory efficiency
(and loading time) of long reference sequences by better controlling the input buffer size.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3665 348d0f76-0448-11de-a6fe-93d51630548a
2010-06-29 07:22:07 +00:00 · 2010-06-29 07:22:07 +00:00 · 2953c9f069
parent ed71e53dd4
commit 2953c9f069
1 changed file with 40 additions and 17 deletions
--- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java
+++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java
@@ -24,6 +24,11 @@ import java.util.Iterator;
 * the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
 */
 public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
+    /**
+     * Size of the read buffer.
+     */
+    private static final int BUFFER_SIZE = 128 * 1024;
+
    /**
     * Stores the main fasta file.
     */
@@ -101,7 +106,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
        dictionaryName += ".dict";
        final File dictionary = new File(dictionaryName);
        if (!dictionary.exists())
-            throw new PicardException("Unable to load .dict file.  Dictionary is required for the indexed fasta reader.");    
+            throw new PicardException("Unable to load .dict file.  Dictionary is required for the indexed fasta reader.");

        IoUtil.assertFileIsReadable(dictionary);

@@ -208,27 +213,45 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {

        final int basesPerLine = indexEntry.getBasesPerLine();
        final int bytesPerLine = indexEntry.getBytesPerLine();
+        final int terminatorLength = bytesPerLine - basesPerLine;

-        final long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
-        final long stopOffset = ((stop-1)/basesPerLine)*bytesPerLine + (stop-1)%basesPerLine;
-        final int size = (int)(stopOffset-startOffset)+1;
+        long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;

-        ByteBuffer channelBuffer = ByteBuffer.allocate(size);
-        try {
-            channel.read(channelBuffer,indexEntry.getLocation()+startOffset);
-        }
-        catch(IOException ex) {
-            throw new PicardException("Unable to map FASTA file into memory.");
-        }
+        // Allocate a 128K buffer for reading in sequence data.
+        ByteBuffer channelBuffer = ByteBuffer.allocate(BUFFER_SIZE);

-        channelBuffer.position(0);
-        channelBuffer.limit(Math.min(basesPerLine-(int)startOffset%bytesPerLine,size));
+        while(targetBuffer.position() < length) {
+            // If the bufferOffset is currently within the eol characters in the string, push the bufferOffset forward to the next printable character.
+            startOffset += Math.max((int)(startOffset%bytesPerLine - basesPerLine + 1),0);

-        while( channelBuffer.hasRemaining() ) {
-            targetBuffer.put(channelBuffer);
+            try {
+                 startOffset += channel.read(channelBuffer,indexEntry.getLocation()+startOffset);
+            }
+            catch(IOException ex) {
+                throw new PicardException("Unable to map FASTA file into memory.");
+            }

-            channelBuffer.limit(Math.min(channelBuffer.limit()+bytesPerLine,size));
-            channelBuffer.position(Math.min(channelBuffer.position()+bytesPerLine-basesPerLine,size));
+            // Reset the buffer for outbound transfers.
+            channelBuffer.flip();
+
+            // Calculate the size of the next run of bases based on the contents we've already retrieved.
+            final int positionInContig = (int)start-1+targetBuffer.position();
+            final int nextBaseSpan = Math.min(basesPerLine-positionInContig%basesPerLine,length-targetBuffer.position());
+            // Cap the bytes to transfer by limiting the nextBaseSpan to the size of the channel buffer.
+            int bytesToTransfer = Math.min(nextBaseSpan,channelBuffer.capacity());
+
+            channelBuffer.limit(channelBuffer.position()+bytesToTransfer);
+
+            while(channelBuffer.hasRemaining()) {
+                targetBuffer.put(channelBuffer);
+
+                bytesToTransfer = Math.min(basesPerLine,length-targetBuffer.position());
+                channelBuffer.limit(Math.min(channelBuffer.position()+bytesToTransfer+terminatorLength,channelBuffer.capacity()));
+                channelBuffer.position(Math.min(channelBuffer.position()+terminatorLength,channelBuffer.capacity()));
+            }
+
+            // Reset the buffer for inbound transfers.
+            channelBuffer.flip();
        }

        return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), target );