Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface,

and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a
2009-04-15 13:46:56 +00:00 · 2009-04-15 13:46:56 +00:00 · 182626576f
parent 7949e377e4
commit 182626576f
3 changed files with 268 additions and 0 deletions
--- a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java
+++ b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java
@ -1,6 +1,7 @@
 package org.broadinstitute.sting.utils.fasta;

 import edu.mit.broad.picard.PicardException;
+import edu.mit.broad.picard.io.IoUtil;

 import java.util.Scanner;
 import java.util.HashMap;
@ -29,6 +30,20 @@ public class FastaSequenceIndex {
     * @throws PicardException if file is of invalid format.
     */
    public FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
+        if(!indexFile.exists())
+            throw new FileNotFoundException("Index file is missing");
+
+        IoUtil.assertFileIsReadable(indexFile);
+        parseIndexFile(indexFile);
+    }
+
+
+    /**
+     * Parse the contents of an index file, caching the results internally.
+     * @param indexFile File to parse.
+     * @throws FileNotFoundException Thrown if file could not be opened.
+     */
+    private void parseIndexFile(File indexFile) throws FileNotFoundException {
        Scanner scanner = new Scanner(indexFile);

        while( scanner.hasNext() ) {
--- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java
+++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java
@ -0,0 +1,142 @@
+package org.broadinstitute.sting.utils.fasta;
+
+import edu.mit.broad.picard.reference.ReferenceSequenceFile;
+import edu.mit.broad.picard.reference.ReferenceSequence;
+import edu.mit.broad.picard.PicardException;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.Charset;
+import java.nio.charset.CharacterCodingException;
+import java.util.Scanner;
+
+import net.sf.samtools.SAMSequenceDictionary;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: hanna
+ * Date: Apr 14, 2009
+ * Time: 2:14:26 PM
+ *
+ * A fasta file driven by an index for fast, concurrent lookups.  Supports two interfaces:
+ * the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
+ */
+public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
+    // Using buffer size of 4k because that's what Picard uses; no thought went into this.
+    private static final int BUFFERSIZE = 4096;
+
+    private final File file;
+    private FileInputStream in;
+    private FileChannel channel;
+
+    private final FastaSequenceIndex index;
+
+    private String currentContigName = null;
+
+    public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
+        this.file = file;
+        // TODO: Add support for gzipped files
+        in = new FileInputStream(file);
+        channel = in.getChannel();
+
+        File indexFile = new File(file.getAbsolutePath() + ".fai");
+        index = new FastaSequenceIndex(indexFile);
+    }
+
+    public SAMSequenceDictionary getSequenceDictionary() {
+        throw new UnsupportedOperationException("Indexed fasta files do not require dictionaries");
+    }
+
+    public ReferenceSequence getSequence( String contig ) {
+        return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize() );
+    }
+
+    public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) {
+        FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);
+
+        final int basesPerLine = indexEntry.getBasesPerLine();
+
+        // Start reading at the closest start-of-line to our data.
+        long readStart = indexEntry.getLocation() + (pos / basesPerLine);
+        int dataOfInterestStart = pos % basesPerLine;
+
+        byte[] accumulator = new byte[length];
+        int nextAccumulatorSlot = 0;        
+
+        while(length > 0) {
+            ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE);
+            try {
+                channel.read(buffer, readStart);
+                readStart += BUFFERSIZE;
+            }
+            catch( IOException ex ) {
+                throw new PicardException("Unable to read directly from fasta", ex);
+            }
+
+            final int basesTransferred = transferToBuffer( buffer,
+                                                           dataOfInterestStart,
+                                                           accumulator,
+                                                           nextAccumulatorSlot,
+                                                           length );
+
+            nextAccumulatorSlot += basesTransferred;
+            length -= basesTransferred;
+            dataOfInterestStart = 0;
+        }
+
+        return new ReferenceSequence( contig, pos, accumulator );
+    }
+
+    /**
+     * Transfers the contents of the given ByteBuffer to the given byte array, discarding
+     * line breaks at regular intervals.  Copies as many as length bases, depending on the
+     * buffer size.  Returns the number of bytes actually copied.
+     * @param source The source ByteBuffer.
+     * @param sourceStart The starting position to copy within the byte buffer
+     * @param target Destination for the data
+     * @param targetStart Index into target buffer.
+     * @param length How much data to move.
+     * @return How many bytes were actually transferred.
+     */
+    private int transferToBuffer( ByteBuffer source,
+                                  int sourceStart,
+                                  byte[] target,
+                                  int targetStart,
+                                  int length ) {
+        source.position(sourceStart);
+        int basesRead = 0;
+        CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();
+
+        Scanner scanner = null;
+        try {
+            scanner = new Scanner(decoder.decode(source).toString());
+        }
+        catch(CharacterCodingException ex) {
+            throw new PicardException("Malformed subsequence",ex);
+        }
+
+        while( scanner.hasNext() && basesRead < length ) {
+            String sourceLine = scanner.nextLine();
+            byte[] sourceData = sourceLine.getBytes();
+            int basesToTransfer = Math.min(sourceData.length,length - basesRead);
+            System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer);
+
+            basesRead += basesToTransfer;
+        }
+        return basesRead;
+    }
+
+
+    public ReferenceSequence nextSequence() {
+        return getSubsequenceAt("chrM", 0, 20);
+    }
+
+    public String toString() {
+        return this.file.getAbsolutePath();
+    }
+}
--- a/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java
+++ b/java/test/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFileTest.java
@ -0,0 +1,111 @@
+package org.broadinstitute.sting.utils.fasta;
+
+import org.junit.BeforeClass;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.Assert;
+import org.broadinstitute.sting.BaseTest;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+
+import edu.mit.broad.picard.reference.ReferenceSequence;
+import net.sf.samtools.util.StringUtil;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: hanna
+ * Date: Apr 14, 2009
+ * Time: 2:37:29 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class IndexedFastaSequenceFileTest extends BaseTest {
+    private static String sequenceFileName;
+    private IndexedFastaSequenceFile sequenceFile = null;
+
+    private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
+    private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" +
+                                               "TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG" +
+                                               "GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT";    
+    private final String firstBasesOfChr1 = "taaccctaaccctaacccta";
+    private final String firstBasesOfChr8 = "GCAATTATGACACAAAAAAT";
+
+    @BeforeClass
+    public static void initialize() {
+         sequenceFileName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
+    }
+
+    @Before
+    public void doForEachTest() throws FileNotFoundException {
+        sequenceFile = new IndexedFastaSequenceFile( new File(sequenceFileName) );
+    }
+
+    @Test
+    public void testOpenFile() {
+        long startTime = System.currentTimeMillis();
+        Assert.assertNotNull( sequenceFile );
+        long endTime = System.currentTimeMillis();
+
+        System.err.printf("testOpenFile runtime: %dms%n", (endTime - startTime)) ;
+    }
+
+    @Test
+    public void testFirstSequence() {
+        long startTime = System.currentTimeMillis();
+        ReferenceSequence sequence = sequenceFile.nextSequence();
+        Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
+        Assert.assertEquals( "First n bases of chrM are incorrect",
+                             firstBasesOfChrM,
+                             StringUtil.bytesToString( sequence.getBases() ) );
+        long endTime = System.currentTimeMillis();
+
+        System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ;
+    }
+
+    @Test
+    public void testFirstSequenceExtended() {
+        long startTime = System.currentTimeMillis();
+        ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,extendedBasesOfChrM.length());
+        Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
+        Assert.assertEquals( "First n bases of chrM are incorrect",
+                             extendedBasesOfChrM.substring(0,110),
+                             StringUtil.bytesToString( sequence.getBases(),0,110 ) );
+        long endTime = System.currentTimeMillis();
+
+        System.err.printf("testFirstSequenceExtended runtime: %dms%n", (endTime - startTime)) ;
+    }
+
+    @Test
+    public void testReadStartingInCenterOfLine() {
+        final int bytesToChopOff = 5;
+        String truncated = extendedBasesOfChrM.substring(bytesToChopOff);
+
+        long startTime = System.currentTimeMillis();
+        ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", bytesToChopOff ,truncated.length() );
+        Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
+        Assert.assertEquals( "First n bases of chrM are incorrect",
+                             truncated,
+                             StringUtil.bytesToString( sequence.getBases() ) );
+        long endTime = System.currentTimeMillis();
+
+        System.err.printf("testReadStartingInCenterOfLine runtime: %dms%n", (endTime - startTime)) ;
+    }
+
+    @Test
+    public void testCompleteContigRead() {
+        FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
+        ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
+
+        long startTime = System.currentTimeMillis();
+        ReferenceSequence sequence = sequenceFile.getSequence("chrM");        
+        Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
+        Assert.assertEquals("chrM is incorrect",
+                            StringUtil.bytesToString(expectedSequence.getBases(),0,4096),
+                            StringUtil.bytesToString(sequence.getBases(),0,4096) );
+        long endTime = System.currentTimeMillis();
+
+        System.err.printf("testCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;        
+    }
+
+
+}