2009-04-15 21:46:56 +08:00
|
|
|
package org.broadinstitute.sting.utils.fasta;
|
|
|
|
|
|
|
|
|
|
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
|
|
|
|
import edu.mit.broad.picard.reference.ReferenceSequence;
|
|
|
|
|
import edu.mit.broad.picard.PicardException;
|
2009-04-16 02:04:13 +08:00
|
|
|
import edu.mit.broad.picard.io.IoUtil;
|
2009-04-15 21:46:56 +08:00
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.nio.channels.FileChannel;
|
|
|
|
|
import java.nio.ByteBuffer;
|
|
|
|
|
import java.nio.charset.CharsetDecoder;
|
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
|
import java.nio.charset.CharacterCodingException;
|
|
|
|
|
import java.util.Scanner;
|
2009-04-16 01:17:11 +08:00
|
|
|
import java.util.Iterator;
|
2009-04-15 21:46:56 +08:00
|
|
|
|
|
|
|
|
import net.sf.samtools.SAMSequenceDictionary;
|
2009-04-16 02:04:13 +08:00
|
|
|
import net.sf.samtools.SAMTextHeaderCodec;
|
|
|
|
|
import net.sf.samtools.SAMFileHeader;
|
|
|
|
|
import net.sf.samtools.SAMSequenceRecord;
|
|
|
|
|
import net.sf.samtools.util.AsciiLineReader;
|
2009-04-15 21:46:56 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Created by IntelliJ IDEA.
|
|
|
|
|
* User: hanna
|
|
|
|
|
* Date: Apr 14, 2009
|
|
|
|
|
* Time: 2:14:26 PM
|
|
|
|
|
*
|
|
|
|
|
* A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces:
|
|
|
|
|
* the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
|
|
|
|
|
*/
|
|
|
|
|
public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
|
|
|
|
// Using buffer size of 4k because that's what Picard uses; no thought went into this.
|
|
|
|
|
private static final int BUFFERSIZE = 4096;
|
|
|
|
|
|
|
|
|
|
private final File file;
|
|
|
|
|
private FileInputStream in;
|
|
|
|
|
private FileChannel channel;
|
|
|
|
|
|
2009-04-16 02:04:13 +08:00
|
|
|
private SAMSequenceDictionary sequenceDictionary = null;
|
|
|
|
|
|
|
|
|
|
private FastaSequenceIndex index;
|
|
|
|
|
private Iterator<FastaSequenceIndexEntry> indexIterator;
|
2009-04-15 21:46:56 +08:00
|
|
|
|
|
|
|
|
public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
|
|
|
|
|
this.file = file;
|
|
|
|
|
// TODO: Add support for gzipped files
|
|
|
|
|
in = new FileInputStream(file);
|
|
|
|
|
channel = in.getChannel();
|
|
|
|
|
|
2009-04-16 02:04:13 +08:00
|
|
|
loadDictionary(file);
|
|
|
|
|
loadIndex(file);
|
|
|
|
|
sanityCheckDictionaryAgainstIndex();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Loads a dictionary, if available.
|
|
|
|
|
* @param fastaFile File to check for a match.
|
|
|
|
|
* TODO: This code is copied directly from FastaSequenceFile / FastaSequenceFile2. Bring it into a shared utility.
|
|
|
|
|
*/
|
|
|
|
|
private void loadDictionary( File fastaFile ) {
|
|
|
|
|
// Try and locate the dictionary
|
|
|
|
|
String dictionaryName = fastaFile.getAbsolutePath();
|
|
|
|
|
dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta"));
|
|
|
|
|
dictionaryName += ".dict";
|
|
|
|
|
final File dictionary = new File(dictionaryName);
|
|
|
|
|
if (dictionary.exists()) {
|
|
|
|
|
IoUtil.assertFileIsReadable(dictionary);
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
|
|
|
|
|
final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary);
|
|
|
|
|
if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) {
|
|
|
|
|
this.sequenceDictionary = header.getSequenceDictionary();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
catch (Exception e) {
|
|
|
|
|
throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Loads the index for the fasta, if present. Throws an exception if now present.
|
|
|
|
|
*/
|
|
|
|
|
private void loadIndex( File fastaFile ) throws FileNotFoundException {
|
|
|
|
|
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
|
2009-04-15 21:46:56 +08:00
|
|
|
index = new FastaSequenceIndex(indexFile);
|
2009-04-16 01:17:11 +08:00
|
|
|
indexIterator = index.iterator();
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-16 02:04:13 +08:00
|
|
|
/**
|
|
|
|
|
* Do some basic checking to make sure the dictionary and the index match.
|
|
|
|
|
*/
|
|
|
|
|
private void sanityCheckDictionaryAgainstIndex() {
|
|
|
|
|
// Make sure dictionary and index are the same size.
|
|
|
|
|
if( sequenceDictionary.getSequences().size() != index.size() )
|
|
|
|
|
throw new PicardException("Sequence dictionary and index contain different numbers of contigs");
|
|
|
|
|
|
|
|
|
|
for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) {
|
|
|
|
|
// Make sure sequence name is present in the index.
|
|
|
|
|
String sequenceName = sequenceRecord.getSequenceName();
|
|
|
|
|
if( !index.hasIndexEntry(sequenceName) )
|
|
|
|
|
throw new PicardException("Index does not contain dictionary entry: " + sequenceName );
|
|
|
|
|
|
|
|
|
|
// Make sure sequence length matches index length.
|
|
|
|
|
if( sequenceRecord.getSequenceLength() != index.getIndexEntry(sequenceName).getSize())
|
|
|
|
|
throw new PicardException("Index length does not match dictionary length for contig: " + sequenceName );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-15 21:46:56 +08:00
|
|
|
public SAMSequenceDictionary getSequenceDictionary() {
|
2009-04-16 02:04:13 +08:00
|
|
|
return sequenceDictionary;
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public ReferenceSequence getSequence( String contig ) {
|
|
|
|
|
return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize() );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) {
|
|
|
|
|
FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);
|
|
|
|
|
|
2009-04-16 01:17:11 +08:00
|
|
|
if(pos + length - 1 > indexEntry.getSize())
|
|
|
|
|
throw new PicardException("Query asks for data past end of contig");
|
|
|
|
|
|
2009-04-15 21:46:56 +08:00
|
|
|
final int basesPerLine = indexEntry.getBasesPerLine();
|
2009-04-16 01:17:11 +08:00
|
|
|
final int bytesPerLine = indexEntry.getBytesPerLine();
|
2009-04-15 21:46:56 +08:00
|
|
|
|
|
|
|
|
// Start reading at the closest start-of-line to our data.
|
2009-04-16 01:17:11 +08:00
|
|
|
long readStart = indexEntry.getLocation() + (pos / basesPerLine) * bytesPerLine;
|
2009-04-15 21:46:56 +08:00
|
|
|
int dataOfInterestStart = pos % basesPerLine;
|
|
|
|
|
|
|
|
|
|
byte[] accumulator = new byte[length];
|
|
|
|
|
int nextAccumulatorSlot = 0;
|
|
|
|
|
|
|
|
|
|
while(length > 0) {
|
|
|
|
|
ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE);
|
|
|
|
|
try {
|
|
|
|
|
channel.read(buffer, readStart);
|
|
|
|
|
readStart += BUFFERSIZE;
|
|
|
|
|
}
|
|
|
|
|
catch( IOException ex ) {
|
|
|
|
|
throw new PicardException("Unable to read directly from fasta", ex);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
final int basesTransferred = transferToBuffer( buffer,
|
|
|
|
|
dataOfInterestStart,
|
|
|
|
|
accumulator,
|
|
|
|
|
nextAccumulatorSlot,
|
|
|
|
|
length );
|
|
|
|
|
|
|
|
|
|
nextAccumulatorSlot += basesTransferred;
|
|
|
|
|
length -= basesTransferred;
|
|
|
|
|
dataOfInterestStart = 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return new ReferenceSequence( contig, pos, accumulator );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Transfers the contents of the given ByteBuffer to the given byte array, discarding
|
|
|
|
|
* line breaks at regular intervals. Copies as many as length bases, depending on the
|
|
|
|
|
* buffer size. Returns the number of bytes actually copied.
|
|
|
|
|
* @param source The source ByteBuffer.
|
|
|
|
|
* @param sourceStart The starting position to copy within the byte buffer
|
|
|
|
|
* @param target Destination for the data
|
|
|
|
|
* @param targetStart Index into target buffer.
|
|
|
|
|
* @param length How much data to move.
|
|
|
|
|
* @return How many bytes were actually transferred.
|
|
|
|
|
*/
|
|
|
|
|
private int transferToBuffer( ByteBuffer source,
|
|
|
|
|
int sourceStart,
|
|
|
|
|
byte[] target,
|
|
|
|
|
int targetStart,
|
|
|
|
|
int length ) {
|
|
|
|
|
source.position(sourceStart);
|
|
|
|
|
int basesRead = 0;
|
|
|
|
|
CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();
|
|
|
|
|
|
|
|
|
|
Scanner scanner = null;
|
|
|
|
|
try {
|
|
|
|
|
scanner = new Scanner(decoder.decode(source).toString());
|
|
|
|
|
}
|
|
|
|
|
catch(CharacterCodingException ex) {
|
|
|
|
|
throw new PicardException("Malformed subsequence",ex);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
while( scanner.hasNext() && basesRead < length ) {
|
|
|
|
|
String sourceLine = scanner.nextLine();
|
|
|
|
|
byte[] sourceData = sourceLine.getBytes();
|
|
|
|
|
int basesToTransfer = Math.min(sourceData.length,length - basesRead);
|
|
|
|
|
System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer);
|
|
|
|
|
|
|
|
|
|
basesRead += basesToTransfer;
|
|
|
|
|
}
|
|
|
|
|
return basesRead;
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-16 01:17:11 +08:00
|
|
|
/**
|
|
|
|
|
* Gets the next sequence if available, or null if not present.
|
|
|
|
|
* @return next sequence if available, or null if not present.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
public ReferenceSequence nextSequence() {
|
2009-04-16 01:17:11 +08:00
|
|
|
if( !indexIterator.hasNext() )
|
|
|
|
|
return null;
|
|
|
|
|
return getSequence( indexIterator.next().getContig() );
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String toString() {
|
|
|
|
|
return this.file.getAbsolutePath();
|
|
|
|
|
}
|
|
|
|
|
}
|