From f4b6afb42cfee1d845010c6e8b4eaa08a9c857db Mon Sep 17 00:00:00 2001 From: hanna Date: Fri, 9 Oct 2009 04:45:46 +0000 Subject: [PATCH] JVM issue id 5092131 (http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=5092131) was causing OOM issues with the new mmapping fasta file reader during large jobs. Temporarily reverting the reader until a workaround can be found. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1801 348d0f76-0448-11de-a6fe-93d51630548a --- .../utils/fasta/IndexedFastaSequenceFile.java | 93 ++++++++++++++----- .../fasta/IndexedFastaSequenceFileTest.java | 30 +----- 2 files changed, 71 insertions(+), 52 deletions(-) diff --git a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java index c6c9c2251..236ac4125 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java @@ -47,6 +47,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { public IndexedFastaSequenceFile(File file) throws FileNotFoundException { this.file = file; + // TODO: Add support for gzipped files in = new FileInputStream(file); channel = in.getChannel(); @@ -101,8 +102,6 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { /** * Loads the index for the fasta, if present. Throws an exception if now present. - * @param fastaFile FASTA file to load. - * @throws FileNotFoundException if FASTA file cannot be found. */ private void loadIndex( File fastaFile ) throws FileNotFoundException { File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); @@ -110,7 +109,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { throw new PicardException(String.format("Unable to load fasta index file %s. "+ "Please create it using 'samtools faidx'.",indexFile.getAbsolutePath())); index = new FastaSequenceIndex(indexFile); - reset(); + indexIterator = index.iterator(); } /** @@ -160,6 +159,10 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { if(start > stop) throw new PicardException(String.format("Malformed query; start point %d lies after end point %d",start,stop)); + if(start > Integer.MAX_VALUE) + throw new PicardException("Due to current ReferenceSequence limitations, a start point larger than Integer.MAX_VALUE cannot be loaded."); + if(stop - start + 1 > Integer.MAX_VALUE) + throw new PicardException("Due to current ReferenceSequence limitations, a region larger than Integer.MAX_VALUE cannot be loaded."); FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig); @@ -168,37 +171,77 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { int length = (int)(stop - start + 1); - byte[] target = new byte[length]; - ByteBuffer targetBuffer = ByteBuffer.wrap(target); - final int basesPerLine = indexEntry.getBasesPerLine(); final int bytesPerLine = indexEntry.getBytesPerLine(); - final long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine; - final long stopOffset = ((stop-1)/basesPerLine)*bytesPerLine + (stop-1)%basesPerLine; - final int size = (int)(stopOffset-startOffset); + // Start reading at the closest start-of-line to our data. + long readStart = indexEntry.getLocation() + ((start-1) / basesPerLine) * bytesPerLine; + int dataOfInterestStart = (int)((start-1) % basesPerLine); - ByteBuffer channelBuffer; + byte[] accumulator = new byte[length]; + int nextAccumulatorSlot = 0; + + while(length > 0) { + ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE); + try { + channel.read(buffer, readStart); + readStart += BUFFERSIZE; + } + catch( IOException ex ) { + throw new PicardException("Unable to read directly from fasta", ex); + } + + final int basesTransferred = transferToBuffer( buffer, + dataOfInterestStart, + accumulator, + nextAccumulatorSlot, + length ); + + nextAccumulatorSlot += basesTransferred; + length -= basesTransferred; + dataOfInterestStart = 0; + } + + return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), accumulator ); + } + + /** + * Transfers the contents of the given ByteBuffer to the given byte array, discarding + * line breaks at regular intervals. Copies as many as length bases, depending on the + * buffer size. Returns the number of bytes actually copied. + * @param source The source ByteBuffer. + * @param sourceStart The starting position to copy within the byte buffer + * @param target Destination for the data + * @param targetStart Index into target buffer. + * @param length How much data to move. + * @return How many bytes were actually transferred. + */ + private int transferToBuffer( ByteBuffer source, + int sourceStart, + byte[] target, + int targetStart, + int length ) { + source.position(sourceStart); + int basesRead = 0; + CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder(); + + Scanner scanner = null; try { - channelBuffer = channel.map(FileChannel.MapMode.READ_ONLY, - indexEntry.getLocation()+startOffset, - stopOffset-startOffset+1); + scanner = new Scanner(decoder.decode(source).toString()); } - catch(IOException ex) { - throw new PicardException("Unable to map FASTA file into memory."); + catch(CharacterCodingException ex) { + throw new PicardException("Malformed subsequence",ex); } - channelBuffer.position(0); - channelBuffer.limit(Math.min(basesPerLine-(int)startOffset%bytesPerLine,size+1)); + while( scanner.hasNext() && basesRead < length ) { + String sourceLine = scanner.nextLine(); + byte[] sourceData = sourceLine.getBytes(); + int basesToTransfer = Math.min(sourceData.length,length - basesRead); + System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer); - while( channelBuffer.hasRemaining() ) { - targetBuffer.put(channelBuffer); - - channelBuffer.limit(Math.min(channelBuffer.limit()+bytesPerLine,size+1)); - channelBuffer.position(Math.min(channelBuffer.position()+bytesPerLine-basesPerLine,size+1)); + basesRead += basesToTransfer; } - - return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), target ); + return basesRead; } /** @@ -213,7 +256,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { @Override public void reset() { - indexIterator = index.iterator(); + // TODO: FOR MATT TO IMPL. } public String toString() { diff --git a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java index 2a05b307b..8273a3b48 100755 --- a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java +++ b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java @@ -153,7 +153,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest { } @Test(expected= PicardException.class) - public void testReadPastEndOfContig() { + public void testReadPastEndOfContig() { long startTime = System.currentTimeMillis(); try { ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16800,16900); @@ -265,36 +265,12 @@ public class IndexedFastaSequenceFileTest extends BaseTest { ReferenceSequence sequence = sequenceFile.nextSequence(); long endTime = System.currentTimeMillis(); - Assert.assertEquals("Sequence contig is not correct", "chr1", sequence.getName()); - Assert.assertEquals("Sequence contig index is not correct", 1, sequence.getContigIndex()); - Assert.assertEquals("Sequence size is not correct", expectedSequence.length(), sequence.length()); + Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr1"); + Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 1); Assert.assertEquals("chr1 is incorrect", StringUtil.bytesToString(expectedSequence.getBases()), StringUtil.bytesToString(sequence.getBases()) ); System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ; } - - @Test - public void testReset() { - ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName)); - // Skip past the first one and load the second one. - ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); - - long startTime = System.currentTimeMillis(); - sequenceFile.nextSequence(); - sequenceFile.nextSequence(); - sequenceFile.reset(); - ReferenceSequence sequence = sequenceFile.nextSequence(); - long endTime = System.currentTimeMillis(); - - Assert.assertEquals("Sequence contig is not correct", "chrM", sequence.getName()); - Assert.assertEquals("Sequence contig index is not correct", 0, sequence.getContigIndex()); - Assert.assertEquals("Sequence size is not correct", expectedSequence.length(), sequence.length()); - Assert.assertEquals("chrM is incorrect", - StringUtil.bytesToString(expectedSequence.getBases()), - StringUtil.bytesToString(sequence.getBases()) ); - - System.err.printf("testReset runtime: %dms%n", (endTime - startTime)) ; - } }