From 182626576fca38b3a93a2ebcae789e9a2dda43ae Mon Sep 17 00:00:00 2001 From: hanna Date: Wed, 15 Apr 2009 13:46:56 +0000 Subject: [PATCH] Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/fasta/FastaSequenceIndex.java | 15 ++ .../utils/fasta/IndexedFastaSequenceFile.java | 142 ++++++++++++++++++ .../fasta/IndexedFastaSequenceFileTest.java | 111 ++++++++++++++ 3 files changed, 268 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java create mode 100755 java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java diff --git a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java index 7f33e2dba..324ed6a37 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.fasta; import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.io.IoUtil; import java.util.Scanner; import java.util.HashMap; @@ -29,6 +30,20 @@ public class FastaSequenceIndex { * @throws PicardException if file is of invalid format. */ public FastaSequenceIndex( File indexFile ) throws FileNotFoundException { + if(!indexFile.exists()) + throw new FileNotFoundException("Index file is missing"); + + IoUtil.assertFileIsReadable(indexFile); + parseIndexFile(indexFile); + } + + + /** + * Parse the contents of an index file, caching the results internally. + * @param indexFile File to parse. + * @throws FileNotFoundException Thrown if file could not be opened. + */ + private void parseIndexFile(File indexFile) throws FileNotFoundException { Scanner scanner = new Scanner(indexFile); while( scanner.hasNext() ) { diff --git a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java new file mode 100755 index 000000000..291060fd5 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java @@ -0,0 +1,142 @@ +package org.broadinstitute.sting.utils.fasta; + +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.PicardException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.ByteBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.Charset; +import java.nio.charset.CharacterCodingException; +import java.util.Scanner; + +import net.sf.samtools.SAMSequenceDictionary; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Apr 14, 2009 + * Time: 2:14:26 PM + * + * A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces: + * the ReferenceSequenceFile for old-style, stateful lookups and a direct getter. + */ +public class IndexedFastaSequenceFile implements ReferenceSequenceFile { + // Using buffer size of 4k because that's what Picard uses; no thought went into this. + private static final int BUFFERSIZE = 4096; + + private final File file; + private FileInputStream in; + private FileChannel channel; + + private final FastaSequenceIndex index; + + private String currentContigName = null; + + public IndexedFastaSequenceFile(File file) throws FileNotFoundException { + this.file = file; + // TODO: Add support for gzipped files + in = new FileInputStream(file); + channel = in.getChannel(); + + File indexFile = new File(file.getAbsolutePath() + ".fai"); + index = new FastaSequenceIndex(indexFile); + } + + public SAMSequenceDictionary getSequenceDictionary() { + throw new UnsupportedOperationException("Indexed fasta files do not require dictionaries"); + } + + public ReferenceSequence getSequence( String contig ) { + return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize() ); + } + + public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) { + FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig); + + final int basesPerLine = indexEntry.getBasesPerLine(); + + // Start reading at the closest start-of-line to our data. + long readStart = indexEntry.getLocation() + (pos / basesPerLine); + int dataOfInterestStart = pos % basesPerLine; + + byte[] accumulator = new byte[length]; + int nextAccumulatorSlot = 0; + + while(length > 0) { + ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE); + try { + channel.read(buffer, readStart); + readStart += BUFFERSIZE; + } + catch( IOException ex ) { + throw new PicardException("Unable to read directly from fasta", ex); + } + + final int basesTransferred = transferToBuffer( buffer, + dataOfInterestStart, + accumulator, + nextAccumulatorSlot, + length ); + + nextAccumulatorSlot += basesTransferred; + length -= basesTransferred; + dataOfInterestStart = 0; + } + + return new ReferenceSequence( contig, pos, accumulator ); + } + + /** + * Transfers the contents of the given ByteBuffer to the given byte array, discarding + * line breaks at regular intervals. Copies as many as length bases, depending on the + * buffer size. Returns the number of bytes actually copied. + * @param source The source ByteBuffer. + * @param sourceStart The starting position to copy within the byte buffer + * @param target Destination for the data + * @param targetStart Index into target buffer. + * @param length How much data to move. + * @return How many bytes were actually transferred. + */ + private int transferToBuffer( ByteBuffer source, + int sourceStart, + byte[] target, + int targetStart, + int length ) { + source.position(sourceStart); + int basesRead = 0; + CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder(); + + Scanner scanner = null; + try { + scanner = new Scanner(decoder.decode(source).toString()); + } + catch(CharacterCodingException ex) { + throw new PicardException("Malformed subsequence",ex); + } + + while( scanner.hasNext() && basesRead < length ) { + String sourceLine = scanner.nextLine(); + byte[] sourceData = sourceLine.getBytes(); + int basesToTransfer = Math.min(sourceData.length,length - basesRead); + System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer); + + basesRead += basesToTransfer; + } + return basesRead; + } + + + public ReferenceSequence nextSequence() { + return getSubsequenceAt("chrM", 0, 20); + } + + public String toString() { + return this.file.getAbsolutePath(); + } +} diff --git a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java new file mode 100755 index 000000000..3d80628d7 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java @@ -0,0 +1,111 @@ +package org.broadinstitute.sting.utils.fasta; + +import org.junit.BeforeClass; +import org.junit.Before; +import org.junit.Test; +import org.junit.Assert; +import org.broadinstitute.sting.BaseTest; + +import java.io.File; +import java.io.FileNotFoundException; + +import edu.mit.broad.picard.reference.ReferenceSequence; +import net.sf.samtools.util.StringUtil; + +/** + * Created by IntelliJ IDEA. + * User: hanna + * Date: Apr 14, 2009 + * Time: 2:37:29 PM + * To change this template use File | Settings | File Templates. + */ +public class IndexedFastaSequenceFileTest extends BaseTest { + private static String sequenceFileName; + private IndexedFastaSequenceFile sequenceFile = null; + + private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT"; + private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" + + "TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG" + + "GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT"; + private final String firstBasesOfChr1 = "taaccctaaccctaacccta"; + private final String firstBasesOfChr8 = "GCAATTATGACACAAAAAAT"; + + @BeforeClass + public static void initialize() { + sequenceFileName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; + } + + @Before + public void doForEachTest() throws FileNotFoundException { + sequenceFile = new IndexedFastaSequenceFile( new File(sequenceFileName) ); + } + + @Test + public void testOpenFile() { + long startTime = System.currentTimeMillis(); + Assert.assertNotNull( sequenceFile ); + long endTime = System.currentTimeMillis(); + + System.err.printf("testOpenFile runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testFirstSequence() { + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.nextSequence(); + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); + Assert.assertEquals( "First n bases of chrM are incorrect", + firstBasesOfChrM, + StringUtil.bytesToString( sequence.getBases() ) ); + long endTime = System.currentTimeMillis(); + + System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testFirstSequenceExtended() { + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,extendedBasesOfChrM.length()); + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); + Assert.assertEquals( "First n bases of chrM are incorrect", + extendedBasesOfChrM.substring(0,110), + StringUtil.bytesToString( sequence.getBases(),0,110 ) ); + long endTime = System.currentTimeMillis(); + + System.err.printf("testFirstSequenceExtended runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testReadStartingInCenterOfLine() { + final int bytesToChopOff = 5; + String truncated = extendedBasesOfChrM.substring(bytesToChopOff); + + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", bytesToChopOff ,truncated.length() ); + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); + Assert.assertEquals( "First n bases of chrM are incorrect", + truncated, + StringUtil.bytesToString( sequence.getBases() ) ); + long endTime = System.currentTimeMillis(); + + System.err.printf("testReadStartingInCenterOfLine runtime: %dms%n", (endTime - startTime)) ; + } + + @Test + public void testCompleteContigRead() { + FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName)); + ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); + + long startTime = System.currentTimeMillis(); + ReferenceSequence sequence = sequenceFile.getSequence("chrM"); + Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM"); + Assert.assertEquals("chrM is incorrect", + StringUtil.bytesToString(expectedSequence.getBases(),0,4096), + StringUtil.bytesToString(sequence.getBases(),0,4096) ); + long endTime = System.currentTimeMillis(); + + System.err.printf("testCompleteContigRead runtime: %dms%n", (endTime - startTime)) ; + } + + +}