Efficiency improvement requested by the Picard team in IndexedFastaSequenceFile: improve the memory efficiency
(and loading time) of long reference sequences by better controlling the input buffer size. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@3665 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ed71e53dd4
commit
2953c9f069
|
|
@ -24,6 +24,11 @@ import java.util.Iterator;
|
|||
* the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
|
||||
*/
|
||||
public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
||||
/**
|
||||
* Size of the read buffer.
|
||||
*/
|
||||
private static final int BUFFER_SIZE = 128 * 1024;
|
||||
|
||||
/**
|
||||
* Stores the main fasta file.
|
||||
*/
|
||||
|
|
@ -101,7 +106,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|||
dictionaryName += ".dict";
|
||||
final File dictionary = new File(dictionaryName);
|
||||
if (!dictionary.exists())
|
||||
throw new PicardException("Unable to load .dict file. Dictionary is required for the indexed fasta reader.");
|
||||
throw new PicardException("Unable to load .dict file. Dictionary is required for the indexed fasta reader.");
|
||||
|
||||
IoUtil.assertFileIsReadable(dictionary);
|
||||
|
||||
|
|
@ -208,27 +213,45 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|||
|
||||
final int basesPerLine = indexEntry.getBasesPerLine();
|
||||
final int bytesPerLine = indexEntry.getBytesPerLine();
|
||||
final int terminatorLength = bytesPerLine - basesPerLine;
|
||||
|
||||
final long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
|
||||
final long stopOffset = ((stop-1)/basesPerLine)*bytesPerLine + (stop-1)%basesPerLine;
|
||||
final int size = (int)(stopOffset-startOffset)+1;
|
||||
long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
|
||||
|
||||
ByteBuffer channelBuffer = ByteBuffer.allocate(size);
|
||||
try {
|
||||
channel.read(channelBuffer,indexEntry.getLocation()+startOffset);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new PicardException("Unable to map FASTA file into memory.");
|
||||
}
|
||||
// Allocate a 128K buffer for reading in sequence data.
|
||||
ByteBuffer channelBuffer = ByteBuffer.allocate(BUFFER_SIZE);
|
||||
|
||||
channelBuffer.position(0);
|
||||
channelBuffer.limit(Math.min(basesPerLine-(int)startOffset%bytesPerLine,size));
|
||||
while(targetBuffer.position() < length) {
|
||||
// If the bufferOffset is currently within the eol characters in the string, push the bufferOffset forward to the next printable character.
|
||||
startOffset += Math.max((int)(startOffset%bytesPerLine - basesPerLine + 1),0);
|
||||
|
||||
while( channelBuffer.hasRemaining() ) {
|
||||
targetBuffer.put(channelBuffer);
|
||||
try {
|
||||
startOffset += channel.read(channelBuffer,indexEntry.getLocation()+startOffset);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new PicardException("Unable to map FASTA file into memory.");
|
||||
}
|
||||
|
||||
channelBuffer.limit(Math.min(channelBuffer.limit()+bytesPerLine,size));
|
||||
channelBuffer.position(Math.min(channelBuffer.position()+bytesPerLine-basesPerLine,size));
|
||||
// Reset the buffer for outbound transfers.
|
||||
channelBuffer.flip();
|
||||
|
||||
// Calculate the size of the next run of bases based on the contents we've already retrieved.
|
||||
final int positionInContig = (int)start-1+targetBuffer.position();
|
||||
final int nextBaseSpan = Math.min(basesPerLine-positionInContig%basesPerLine,length-targetBuffer.position());
|
||||
// Cap the bytes to transfer by limiting the nextBaseSpan to the size of the channel buffer.
|
||||
int bytesToTransfer = Math.min(nextBaseSpan,channelBuffer.capacity());
|
||||
|
||||
channelBuffer.limit(channelBuffer.position()+bytesToTransfer);
|
||||
|
||||
while(channelBuffer.hasRemaining()) {
|
||||
targetBuffer.put(channelBuffer);
|
||||
|
||||
bytesToTransfer = Math.min(basesPerLine,length-targetBuffer.position());
|
||||
channelBuffer.limit(Math.min(channelBuffer.position()+bytesToTransfer+terminatorLength,channelBuffer.capacity()));
|
||||
channelBuffer.position(Math.min(channelBuffer.position()+terminatorLength,channelBuffer.capacity()));
|
||||
}
|
||||
|
||||
// Reset the buffer for inbound transfers.
|
||||
channelBuffer.flip();
|
||||
}
|
||||
|
||||
return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), target );
|
||||
|
|
|
|||
Loading…
Reference in New Issue