gatk-3.8/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java

package org.broadinstitute.sting.utils.fasta;

import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.io.IoUtil;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.ByteBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.Charset;
import java.nio.charset.CharacterCodingException;
import java.util.Scanner;
import java.util.Iterator;

import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMTextHeaderCodec;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.AsciiLineReader;

/**
 * Created by IntelliJ IDEA.
 * User: hanna
 * Date: Apr 14, 2009
 * Time: 2:14:26 PM
 *
 * A fasta file driven by an index for fast, concurrent lookups.  Supports two interfaces:
 * the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
 */
public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
    // Using buffer size of 4k because that's what Picard uses; no thought went into this.
    private static final int BUFFERSIZE = 4096;

    private final File file;
    private FileInputStream in;
    private FileChannel channel;

    private SAMSequenceDictionary sequenceDictionary = null;    

    private FastaSequenceIndex index;
    private Iterator<FastaSequenceIndexEntry> indexIterator;

    public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
        this.file = file;
        // TODO: Add support for gzipped files
        in = new FileInputStream(file);
        channel = in.getChannel();

        loadDictionary(file);
        loadIndex(file);
        sanityCheckDictionaryAgainstIndex();
    }

    /**
     * Loads a dictionary, if available.
     * @param fastaFile File to check for a match.
     * TODO: This code is copied directly from FastaSequenceFile / FastaSequenceFile2.  Bring it into a shared utility.
     */
    private void loadDictionary( File fastaFile ) {
        // Try and locate the dictionary
        String dictionaryName = fastaFile.getAbsolutePath();
        dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta"));
        dictionaryName += ".dict";
        final File dictionary = new File(dictionaryName);
        if (dictionary.exists()) {
            IoUtil.assertFileIsReadable(dictionary);

            try {
                final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
                final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary);
                if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) {
                    this.sequenceDictionary = header.getSequenceDictionary();
                }
            }
            catch (Exception e) {
                throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
            }
        }

    }

    /**
     * Loads the index for the fasta, if present.  Throws an exception if now present.
     */
    private void loadIndex( File fastaFile ) throws FileNotFoundException {
        File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
        index = new FastaSequenceIndex(indexFile);
        indexIterator = index.iterator();
    }

    /**
     * Do some basic checking to make sure the dictionary and the index match.
     */
    private void sanityCheckDictionaryAgainstIndex() {
        // Make sure dictionary and index are the same size.
        if( sequenceDictionary.getSequences().size() != index.size() )
            throw new PicardException("Sequence dictionary and index contain different numbers of contigs");

        for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) {
            // Make sure sequence name is present in the index.
            String sequenceName = sequenceRecord.getSequenceName();
            if( !index.hasIndexEntry(sequenceName) )
                throw new PicardException("Index does not contain dictionary entry: " + sequenceName );

            // Make sure sequence length matches index length.
            if( sequenceRecord.getSequenceLength() != index.getIndexEntry(sequenceName).getSize())
                throw new PicardException("Index length does not match dictionary length for contig: " + sequenceName );
        }
    }

    public SAMSequenceDictionary getSequenceDictionary() {
        return sequenceDictionary;
    }

    public ReferenceSequence getSequence( String contig ) {
        return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize() );
    }

    public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) {
        FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);

        if(pos + length - 1 > indexEntry.getSize())
            throw new PicardException("Query asks for data past end of contig");

        final int basesPerLine = indexEntry.getBasesPerLine();
        final int bytesPerLine = indexEntry.getBytesPerLine();

        // Start reading at the closest start-of-line to our data.
        long readStart = indexEntry.getLocation() + (pos / basesPerLine) * bytesPerLine;
        int dataOfInterestStart = pos % basesPerLine;

        byte[] accumulator = new byte[length];
        int nextAccumulatorSlot = 0;        

        while(length > 0) {
            ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE);
            try {
                channel.read(buffer, readStart);
                readStart += BUFFERSIZE;
            }
            catch( IOException ex ) {
                throw new PicardException("Unable to read directly from fasta", ex);
            }

            final int basesTransferred = transferToBuffer( buffer,
                                                           dataOfInterestStart,
                                                           accumulator,
                                                           nextAccumulatorSlot,
                                                           length );

            nextAccumulatorSlot += basesTransferred;
            length -= basesTransferred;
            dataOfInterestStart = 0;
        }

        return new ReferenceSequence( contig, pos, accumulator );
    }

    /**
     * Transfers the contents of the given ByteBuffer to the given byte array, discarding
     * line breaks at regular intervals.  Copies as many as length bases, depending on the
     * buffer size.  Returns the number of bytes actually copied.
     * @param source The source ByteBuffer.
     * @param sourceStart The starting position to copy within the byte buffer
     * @param target Destination for the data
     * @param targetStart Index into target buffer.
     * @param length How much data to move.
     * @return How many bytes were actually transferred.
     */
    private int transferToBuffer( ByteBuffer source,
                                  int sourceStart,
                                  byte[] target,
                                  int targetStart,
                                  int length ) {
        source.position(sourceStart);
        int basesRead = 0;
        CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();

        Scanner scanner = null;
        try {
            scanner = new Scanner(decoder.decode(source).toString());
        }
        catch(CharacterCodingException ex) {
            throw new PicardException("Malformed subsequence",ex);
        }

        while( scanner.hasNext() && basesRead < length ) {
            String sourceLine = scanner.nextLine();
            byte[] sourceData = sourceLine.getBytes();
            int basesToTransfer = Math.min(sourceData.length,length - basesRead);
            System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer);

            basesRead += basesToTransfer;
        }
        return basesRead;
    }

    /**
     * Gets the next sequence if available, or null if not present.
     * @return next sequence if available, or null if not present.
     */
    public ReferenceSequence nextSequence() {
        if( !indexIterator.hasNext() )
            return null;
        return getSequence( indexIterator.next().getContig() );
    }

    public String toString() {
        return this.file.getAbsolutePath();
    }
}
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`package org.broadinstitute.sting.utils.fasta;`

			`import edu.mit.broad.picard.reference.ReferenceSequenceFile;`
			`import edu.mit.broad.picard.reference.ReferenceSequence;`
			`import edu.mit.broad.picard.PicardException;`
Load the dictionary and sanity check it against the index. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 02:04:13 +08:00			`import edu.mit.broad.picard.io.IoUtil;`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00
			`import java.io.File;`
			`import java.io.FileInputStream;`
			`import java.io.FileNotFoundException;`
			`import java.io.IOException;`
			`import java.nio.channels.FileChannel;`
			`import java.nio.ByteBuffer;`
			`import java.nio.charset.CharsetDecoder;`
			`import java.nio.charset.Charset;`
			`import java.nio.charset.CharacterCodingException;`
			`import java.util.Scanner;`
Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 01:17:11 +08:00			`import java.util.Iterator;`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00
			`import net.sf.samtools.SAMSequenceDictionary;`
Load the dictionary and sanity check it against the index. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 02:04:13 +08:00			`import net.sf.samtools.SAMTextHeaderCodec;`
			`import net.sf.samtools.SAMFileHeader;`
			`import net.sf.samtools.SAMSequenceRecord;`
			`import net.sf.samtools.util.AsciiLineReader;`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00
			`/**`
			`* Created by IntelliJ IDEA.`
			`* User: hanna`
			`* Date: Apr 14, 2009`
			`* Time: 2:14:26 PM`
			`*`
			`* A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces:`
			`* the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.`
			`*/`
			`public class IndexedFastaSequenceFile implements ReferenceSequenceFile {`
			`// Using buffer size of 4k because that's what Picard uses; no thought went into this.`
			`private static final int BUFFERSIZE = 4096;`

			`private final File file;`
			`private FileInputStream in;`
			`private FileChannel channel;`

Load the dictionary and sanity check it against the index. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 02:04:13 +08:00			`private SAMSequenceDictionary sequenceDictionary = null;`

			`private FastaSequenceIndex index;`
			`private Iterator<FastaSequenceIndexEntry> indexIterator;`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00
			`public IndexedFastaSequenceFile(File file) throws FileNotFoundException {`
			`this.file = file;`
			`// TODO: Add support for gzipped files`
			`in = new FileInputStream(file);`
			`channel = in.getChannel();`

Load the dictionary and sanity check it against the index. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 02:04:13 +08:00			`loadDictionary(file);`
			`loadIndex(file);`
			`sanityCheckDictionaryAgainstIndex();`
			`}`

			`/**`
			`* Loads a dictionary, if available.`
			`* @param fastaFile File to check for a match.`
			`* TODO: This code is copied directly from FastaSequenceFile / FastaSequenceFile2. Bring it into a shared utility.`
			`*/`
			`private void loadDictionary( File fastaFile ) {`
			`// Try and locate the dictionary`
			`String dictionaryName = fastaFile.getAbsolutePath();`
			`dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta"));`
			`dictionaryName += ".dict";`
			`final File dictionary = new File(dictionaryName);`
			`if (dictionary.exists()) {`
			`IoUtil.assertFileIsReadable(dictionary);`

			`try {`
			`final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();`
			`final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary);`
			`if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) {`
			`this.sequenceDictionary = header.getSequenceDictionary();`
			`}`
			`}`
			`catch (Exception e) {`
			`throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);`
			`}`
			`}`

			`}`

			`/**`
			`* Loads the index for the fasta, if present. Throws an exception if now present.`
			`*/`
			`private void loadIndex( File fastaFile ) throws FileNotFoundException {`
			`File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`index = new FastaSequenceIndex(indexFile);`
Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 01:17:11 +08:00			`indexIterator = index.iterator();`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`}`

Load the dictionary and sanity check it against the index. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 02:04:13 +08:00			`/**`
			`* Do some basic checking to make sure the dictionary and the index match.`
			`*/`
			`private void sanityCheckDictionaryAgainstIndex() {`
			`// Make sure dictionary and index are the same size.`
			`if( sequenceDictionary.getSequences().size() != index.size() )`
			`throw new PicardException("Sequence dictionary and index contain different numbers of contigs");`

			`for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) {`
			`// Make sure sequence name is present in the index.`
			`String sequenceName = sequenceRecord.getSequenceName();`
			`if( !index.hasIndexEntry(sequenceName) )`
			`throw new PicardException("Index does not contain dictionary entry: " + sequenceName );`

			`// Make sure sequence length matches index length.`
			`if( sequenceRecord.getSequenceLength() != index.getIndexEntry(sequenceName).getSize())`
			`throw new PicardException("Index length does not match dictionary length for contig: " + sequenceName );`
			`}`
			`}`

Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`public SAMSequenceDictionary getSequenceDictionary() {`
Load the dictionary and sanity check it against the index. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@430 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 02:04:13 +08:00			`return sequenceDictionary;`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`}`

			`public ReferenceSequence getSequence( String contig ) {`
			`return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize() );`
			`}`

			`public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) {`
			`FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);`

Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 01:17:11 +08:00			`if(pos + length - 1 > indexEntry.getSize())`
			`throw new PicardException("Query asks for data past end of contig");`

Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`final int basesPerLine = indexEntry.getBasesPerLine();`
Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 01:17:11 +08:00			`final int bytesPerLine = indexEntry.getBytesPerLine();`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00
			`// Start reading at the closest start-of-line to our data.`
Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 01:17:11 +08:00			`long readStart = indexEntry.getLocation() + (pos / basesPerLine) * bytesPerLine;`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`int dataOfInterestStart = pos % basesPerLine;`

			`byte[] accumulator = new byte[length];`
			`int nextAccumulatorSlot = 0;`

			`while(length > 0) {`
			`ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE);`
			`try {`
			`channel.read(buffer, readStart);`
			`readStart += BUFFERSIZE;`
			`}`
			`catch( IOException ex ) {`
			`throw new PicardException("Unable to read directly from fasta", ex);`
			`}`

			`final int basesTransferred = transferToBuffer( buffer,`
			`dataOfInterestStart,`
			`accumulator,`
			`nextAccumulatorSlot,`
			`length );`

			`nextAccumulatorSlot += basesTransferred;`
			`length -= basesTransferred;`
			`dataOfInterestStart = 0;`
			`}`

			`return new ReferenceSequence( contig, pos, accumulator );`
			`}`

			`/**`
			`* Transfers the contents of the given ByteBuffer to the given byte array, discarding`
			`* line breaks at regular intervals. Copies as many as length bases, depending on the`
			`* buffer size. Returns the number of bytes actually copied.`
			`* @param source The source ByteBuffer.`
			`* @param sourceStart The starting position to copy within the byte buffer`
			`* @param target Destination for the data`
			`* @param targetStart Index into target buffer.`
			`* @param length How much data to move.`
			`* @return How many bytes were actually transferred.`
			`*/`
			`private int transferToBuffer( ByteBuffer source,`
			`int sourceStart,`
			`byte[] target,`
			`int targetStart,`
			`int length ) {`
			`source.position(sourceStart);`
			`int basesRead = 0;`
			`CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();`

			`Scanner scanner = null;`
			`try {`
			`scanner = new Scanner(decoder.decode(source).toString());`
			`}`
			`catch(CharacterCodingException ex) {`
			`throw new PicardException("Malformed subsequence",ex);`
			`}`

			`while( scanner.hasNext() && basesRead < length ) {`
			`String sourceLine = scanner.nextLine();`
			`byte[] sourceData = sourceLine.getBytes();`
			`int basesToTransfer = Math.min(sourceData.length,length - basesRead);`
			`System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer);`

			`basesRead += basesToTransfer;`
			`}`
			`return basesRead;`
			`}`

Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 01:17:11 +08:00			`/**`
			`* Gets the next sequence if available, or null if not present.`
			`* @return next sequence if available, or null if not present.`
			`*/`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`public ReferenceSequence nextSequence() {`
Added index iteration for ReferenceSequenceFile interface compatibility. Added better error checking for querying past the end of a contig. Lots more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@429 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-16 01:17:11 +08:00			`if( !indexIterator.hasNext() )`
			`return null;`
			`return getSequence( indexIterator.next().getContig() );`
Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface, and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a 2009-04-15 21:46:56 +08:00			`}`

			`public String toString() {`
			`return this.file.getAbsolutePath();`
			`}`
			`}`