was causing OOM issues with the new mmapping fasta file reader during large jobs.
Temporarily reverting the reader until a workaround can be found.


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1801 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-10-09 04:45:46 +00:00
parent 6d7f4481e4
commit f4b6afb42c
2 changed files with 71 additions and 52 deletions

View File

@ -47,6 +47,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
this.file = file;
// TODO: Add support for gzipped files
in = new FileInputStream(file);
channel = in.getChannel();
@ -101,8 +102,6 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
/**
* Loads the index for the fasta, if present. Throws an exception if not present.
* @param fastaFile FASTA file to load.
* @throws FileNotFoundException if FASTA file cannot be found.
*/
private void loadIndex( File fastaFile ) throws FileNotFoundException {
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
@ -110,7 +109,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
throw new PicardException(String.format("Unable to load fasta index file %s. "+
"Please create it using 'samtools faidx'.",indexFile.getAbsolutePath()));
index = new FastaSequenceIndex(indexFile);
reset();
indexIterator = index.iterator();
}
/**
@ -160,6 +159,10 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
if(start > stop)
throw new PicardException(String.format("Malformed query; start point %d lies after end point %d",start,stop));
if(start > Integer.MAX_VALUE)
throw new PicardException("Due to current ReferenceSequence limitations, a start point larger than Integer.MAX_VALUE cannot be loaded.");
if(stop - start + 1 > Integer.MAX_VALUE)
throw new PicardException("Due to current ReferenceSequence limitations, a region larger than Integer.MAX_VALUE cannot be loaded.");
FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);
@ -168,37 +171,77 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
int length = (int)(stop - start + 1);
byte[] target = new byte[length];
ByteBuffer targetBuffer = ByteBuffer.wrap(target);
final int basesPerLine = indexEntry.getBasesPerLine();
final int bytesPerLine = indexEntry.getBytesPerLine();
final long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
final long stopOffset = ((stop-1)/basesPerLine)*bytesPerLine + (stop-1)%basesPerLine;
final int size = (int)(stopOffset-startOffset);
// Start reading at the closest start-of-line to our data.
long readStart = indexEntry.getLocation() + ((start-1) / basesPerLine) * bytesPerLine;
int dataOfInterestStart = (int)((start-1) % basesPerLine);
ByteBuffer channelBuffer;
byte[] accumulator = new byte[length];
int nextAccumulatorSlot = 0;
while(length > 0) {
ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE);
try {
channel.read(buffer, readStart);
readStart += BUFFERSIZE;
}
catch( IOException ex ) {
throw new PicardException("Unable to read directly from fasta", ex);
}
final int basesTransferred = transferToBuffer( buffer,
dataOfInterestStart,
accumulator,
nextAccumulatorSlot,
length );
nextAccumulatorSlot += basesTransferred;
length -= basesTransferred;
dataOfInterestStart = 0;
}
return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), accumulator );
}
/**
* Transfers the contents of the given ByteBuffer to the given byte array, discarding
* line breaks at regular intervals. Copies as many as length bases, depending on the
* buffer size. Returns the number of bytes actually copied.
* @param source The source ByteBuffer.
* @param sourceStart The starting position to copy within the byte buffer
* @param target Destination for the data
* @param targetStart Index into target buffer.
* @param length How much data to move.
* @return How many bytes were actually transferred.
*/
private int transferToBuffer( ByteBuffer source,
int sourceStart,
byte[] target,
int targetStart,
int length ) {
source.position(sourceStart);
int basesRead = 0;
CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();
Scanner scanner = null;
try {
channelBuffer = channel.map(FileChannel.MapMode.READ_ONLY,
indexEntry.getLocation()+startOffset,
stopOffset-startOffset+1);
scanner = new Scanner(decoder.decode(source).toString());
}
catch(IOException ex) {
throw new PicardException("Unable to map FASTA file into memory.");
catch(CharacterCodingException ex) {
throw new PicardException("Malformed subsequence",ex);
}
channelBuffer.position(0);
channelBuffer.limit(Math.min(basesPerLine-(int)startOffset%bytesPerLine,size+1));
while( scanner.hasNext() && basesRead < length ) {
String sourceLine = scanner.nextLine();
byte[] sourceData = sourceLine.getBytes();
int basesToTransfer = Math.min(sourceData.length,length - basesRead);
System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer);
while( channelBuffer.hasRemaining() ) {
targetBuffer.put(channelBuffer);
channelBuffer.limit(Math.min(channelBuffer.limit()+bytesPerLine,size+1));
channelBuffer.position(Math.min(channelBuffer.position()+bytesPerLine-basesPerLine,size+1));
basesRead += basesToTransfer;
}
return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), target );
return basesRead;
}
/**
@ -213,7 +256,7 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
@Override
public void reset() {
indexIterator = index.iterator();
// TODO: FOR MATT TO IMPL.
}
public String toString() {

View File

@ -153,7 +153,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
}
@Test(expected= PicardException.class)
public void testReadPastEndOfContig() {
public void testReadPastEndOfContig() {
long startTime = System.currentTimeMillis();
try {
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",16800,16900);
@ -265,36 +265,12 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
ReferenceSequence sequence = sequenceFile.nextSequence();
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", "chr1", sequence.getName());
Assert.assertEquals("Sequence contig index is not correct", 1, sequence.getContigIndex());
Assert.assertEquals("Sequence size is not correct", expectedSequence.length(), sequence.length());
Assert.assertEquals("Sequence contig is not correct", sequence.getName(), "chr1");
Assert.assertEquals("Sequence contig index is not correct", sequence.getContigIndex(), 1);
Assert.assertEquals("chr1 is incorrect",
StringUtil.bytesToString(expectedSequence.getBases()),
StringUtil.bytesToString(sequence.getBases()) );
System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ;
}
/**
 * Checks that after two nextSequence() calls followed by reset(), the next sequence
 * read from sequenceFile matches the first sequence of a freshly opened reference file.
 * The elapsed time of the advance/reset/read steps is printed to stderr.
 */
@Test
public void testReset() {
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
// The first sequence of a freshly opened file serves as the expected result after reset().
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
// Advance past two sequences, then call reset() before reading again.
sequenceFile.nextSequence();
sequenceFile.nextSequence();
sequenceFile.reset();
// The assertions below expect this read to yield chrM with contig index 0.
ReferenceSequence sequence = sequenceFile.nextSequence();
long endTime = System.currentTimeMillis();
Assert.assertEquals("Sequence contig is not correct", "chrM", sequence.getName());
Assert.assertEquals("Sequence contig index is not correct", 0, sequence.getContigIndex());
Assert.assertEquals("Sequence size is not correct", expectedSequence.length(), sequence.length());
// Compare the bases as strings so both sides are checked by content.
Assert.assertEquals("chrM is incorrect",
StringUtil.bytesToString(expectedSequence.getBases()),
StringUtil.bytesToString(sequence.getBases()) );
System.err.printf("testReset runtime: %dms%n", (endTime - startTime)) ;
}
}