2009-04-15 21:46:56 +08:00
|
|
|
package org.broadinstitute.sting.utils.fasta;
|
|
|
|
|
|
2009-05-29 04:13:01 +08:00
|
|
|
import net.sf.picard.PicardException;
|
|
|
|
|
import net.sf.picard.io.IoUtil;
|
2009-10-01 04:37:59 +08:00
|
|
|
import net.sf.picard.reference.ReferenceSequence;
|
|
|
|
|
import net.sf.picard.reference.ReferenceSequenceFile;
|
|
|
|
|
import net.sf.samtools.SAMFileHeader;
|
|
|
|
|
import net.sf.samtools.SAMSequenceDictionary;
|
|
|
|
|
import net.sf.samtools.SAMSequenceRecord;
|
|
|
|
|
import net.sf.samtools.SAMTextHeaderCodec;
|
|
|
|
|
import net.sf.samtools.util.AsciiLineReader;
|
|
|
|
|
import org.broadinstitute.sting.utils.StingException;
|
2009-04-15 21:46:56 +08:00
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
import java.io.FileInputStream;
|
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
import java.nio.ByteBuffer;
|
2009-10-01 04:37:59 +08:00
|
|
|
import java.nio.channels.FileChannel;
|
2009-04-16 01:17:11 +08:00
|
|
|
import java.util.Iterator;
|
2009-04-15 21:46:56 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces:
|
|
|
|
|
* the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
|
|
|
|
|
*/
|
|
|
|
|
public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
2009-12-09 00:09:04 +08:00
|
|
|
/**
|
|
|
|
|
* Stores the main fasta file.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
private final File file;
|
2009-12-09 00:09:04 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* The interface facilitating direct access to the fasta.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
private FileChannel channel;
|
|
|
|
|
|
2009-12-09 00:09:04 +08:00
|
|
|
/**
|
|
|
|
|
* A representation of the sequence dictionary, stored alongside the fasta in a .dict file.
|
|
|
|
|
*/
|
|
|
|
|
private SAMSequenceDictionary sequenceDictionary = null;
|
2009-04-16 02:04:13 +08:00
|
|
|
|
2009-12-09 00:09:04 +08:00
|
|
|
/**
|
|
|
|
|
* A representation of the sequence index, stored alongside the fasta in a .fasta.fai file.
|
|
|
|
|
*/
|
2009-04-16 02:04:13 +08:00
|
|
|
private FastaSequenceIndex index;
|
2009-12-09 00:09:04 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* An iterator into the fasta index, for traversing iteratively across the fasta.
|
|
|
|
|
*/
|
2009-04-16 02:04:13 +08:00
|
|
|
private Iterator<FastaSequenceIndexEntry> indexIterator;
|
2009-04-15 21:46:56 +08:00
|
|
|
|
2009-12-09 00:09:04 +08:00
|
|
|
/**
|
|
|
|
|
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
|
|
|
|
* @param file The file to open.
|
|
|
|
|
* @throws FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
|
|
|
|
|
this.file = file;
|
2009-12-09 00:09:04 +08:00
|
|
|
FileInputStream in = new FileInputStream(file);
|
2009-04-15 21:46:56 +08:00
|
|
|
channel = in.getChannel();
|
|
|
|
|
|
2009-04-16 02:04:13 +08:00
|
|
|
loadDictionary(file);
|
|
|
|
|
loadIndex(file);
|
|
|
|
|
sanityCheckDictionaryAgainstIndex();
|
|
|
|
|
}
|
|
|
|
|
|
2010-06-11 04:10:23 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
|
|
|
|
|
* @param file The file to open.
|
|
|
|
|
* @param sequenceIndex FastaSequenceIndex that was previously created
|
|
|
|
|
* @throws FileNotFoundException If the fasta or any of its supporting files cannot be found.
|
|
|
|
|
*/
|
|
|
|
|
public IndexedFastaSequenceFile(File file, FastaSequenceIndex sequenceIndex) throws FileNotFoundException {
|
|
|
|
|
this.file = file;
|
|
|
|
|
FileInputStream in = new FileInputStream(file);
|
|
|
|
|
channel = in.getChannel();
|
|
|
|
|
|
|
|
|
|
loadDictionary(file);
|
|
|
|
|
// Temporary change: sequenceIndex is passed in directly. See comments in ReferenceDataSource.
|
|
|
|
|
index = sequenceIndex;
|
|
|
|
|
sanityCheckDictionaryAgainstIndex();
|
|
|
|
|
}
|
|
|
|
|
|
2010-06-17 07:55:07 +08:00
|
|
|
/**
|
|
|
|
|
* Always returns true for this implementation.
|
|
|
|
|
* @return True.
|
|
|
|
|
*/
|
|
|
|
|
public boolean isIndexed() {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-16 02:04:13 +08:00
|
|
|
/**
|
|
|
|
|
* Loads a dictionary, if available.
|
|
|
|
|
* @param fastaFile File to check for a match.
|
|
|
|
|
*/
|
|
|
|
|
private void loadDictionary( File fastaFile ) {
|
|
|
|
|
// Try and locate the dictionary
|
|
|
|
|
String dictionaryName = fastaFile.getAbsolutePath();
|
2009-07-10 02:18:48 +08:00
|
|
|
dictionaryName = dictionaryName.substring(0, getFastaFileExtensionStart(dictionaryName));
|
2009-04-16 02:04:13 +08:00
|
|
|
dictionaryName += ".dict";
|
|
|
|
|
final File dictionary = new File(dictionaryName);
|
2009-05-18 05:57:12 +08:00
|
|
|
if (!dictionary.exists())
|
|
|
|
|
throw new PicardException("Unable to load .dict file. Dictionary is required for the indexed fasta reader.");
|
2009-04-16 02:04:13 +08:00
|
|
|
|
2009-05-18 05:57:12 +08:00
|
|
|
IoUtil.assertFileIsReadable(dictionary);
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
|
2009-10-17 03:38:56 +08:00
|
|
|
final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)),
|
|
|
|
|
dictionary.toString());
|
2009-05-18 05:57:12 +08:00
|
|
|
if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) {
|
|
|
|
|
this.sequenceDictionary = header.getSequenceDictionary();
|
2009-04-16 02:04:13 +08:00
|
|
|
}
|
|
|
|
|
}
|
2009-05-18 05:57:12 +08:00
|
|
|
catch (Exception e) {
|
|
|
|
|
throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
|
|
|
|
|
}
|
2009-04-16 02:04:13 +08:00
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-10 02:18:48 +08:00
|
|
|
/**
|
|
|
|
|
* Gets the index of the first character in the fasta file's extension.
|
|
|
|
|
* @param filename The filename of the fasta. Must not be null, and must end with either '.fasta' or '.fa'.
|
|
|
|
|
* @return The index of the start of the extension within the filename. If neither '.fasta' nor '.fa' are
|
|
|
|
|
* present in the filename, a StingException will be thrown.
|
|
|
|
|
*/
|
|
|
|
|
private int getFastaFileExtensionStart( String filename ) {
|
|
|
|
|
if( filename.endsWith(".fasta") )
|
|
|
|
|
return filename.lastIndexOf(".fasta");
|
|
|
|
|
else if( filename.endsWith(".fa") )
|
|
|
|
|
return filename.lastIndexOf(".fa");
|
|
|
|
|
else
|
|
|
|
|
throw new StingException("Invalid fasta filename; fasta filename must end with '.fasta' or '.fa'.");
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-16 02:04:13 +08:00
|
|
|
/**
|
|
|
|
|
* Loads the index for the fasta, if present. Throws an exception if now present.
|
2009-10-10 01:46:56 +08:00
|
|
|
* @param fastaFile FASTA file to load.
|
|
|
|
|
* @throws FileNotFoundException if FASTA file cannot be found.
|
2009-04-16 02:04:13 +08:00
|
|
|
*/
|
|
|
|
|
private void loadIndex( File fastaFile ) throws FileNotFoundException {
|
|
|
|
|
File indexFile = new File(fastaFile.getAbsolutePath() + ".fai");
|
2009-06-01 23:34:38 +08:00
|
|
|
if (!indexFile.exists())
|
|
|
|
|
throw new PicardException(String.format("Unable to load fasta index file %s. "+
|
|
|
|
|
"Please create it using 'samtools faidx'.",indexFile.getAbsolutePath()));
|
2009-04-15 21:46:56 +08:00
|
|
|
index = new FastaSequenceIndex(indexFile);
|
2009-10-10 01:46:56 +08:00
|
|
|
reset();
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-16 02:04:13 +08:00
|
|
|
/**
|
|
|
|
|
* Do some basic checking to make sure the dictionary and the index match.
|
|
|
|
|
*/
|
|
|
|
|
private void sanityCheckDictionaryAgainstIndex() {
|
|
|
|
|
// Make sure dictionary and index are the same size.
|
|
|
|
|
if( sequenceDictionary.getSequences().size() != index.size() )
|
|
|
|
|
throw new PicardException("Sequence dictionary and index contain different numbers of contigs");
|
|
|
|
|
|
|
|
|
|
for( SAMSequenceRecord sequenceRecord: sequenceDictionary.getSequences() ) {
|
|
|
|
|
// Make sure sequence name is present in the index.
|
|
|
|
|
String sequenceName = sequenceRecord.getSequenceName();
|
|
|
|
|
if( !index.hasIndexEntry(sequenceName) )
|
|
|
|
|
throw new PicardException("Index does not contain dictionary entry: " + sequenceName );
|
|
|
|
|
|
|
|
|
|
// Make sure sequence length matches index length.
|
|
|
|
|
if( sequenceRecord.getSequenceLength() != index.getIndexEntry(sequenceName).getSize())
|
|
|
|
|
throw new PicardException("Index length does not match dictionary length for contig: " + sequenceName );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-25 03:40:21 +08:00
|
|
|
/**
|
|
|
|
|
* Retrieves the sequence dictionary for the fasta file.
|
|
|
|
|
* @return sequence dictionary of the fasta.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
public SAMSequenceDictionary getSequenceDictionary() {
|
2009-04-16 02:04:13 +08:00
|
|
|
return sequenceDictionary;
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-25 03:40:21 +08:00
|
|
|
/**
|
|
|
|
|
* Retrieves the complete sequence described by this contig.
|
|
|
|
|
* @param contig contig whose data should be returned.
|
|
|
|
|
* @return The full sequence associated with this contig.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
public ReferenceSequence getSequence( String contig ) {
|
2009-04-25 03:40:21 +08:00
|
|
|
return getSubsequenceAt( contig, 1, (int)index.getIndexEntry(contig).getSize() );
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-25 03:40:21 +08:00
|
|
|
/**
|
|
|
|
|
* Gets the subsequence of the contig in the range [start,stop]
|
|
|
|
|
* @param contig Contig whose subsequence to retrieve.
|
|
|
|
|
* @param start inclusive, 1-based start of region.
|
|
|
|
|
* @param stop inclusive, 1-based stop of region.
|
|
|
|
|
* @return The partial reference sequence associated with this range.
|
|
|
|
|
*/
|
2009-04-16 08:04:41 +08:00
|
|
|
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
|
|
|
|
|
if(start > stop)
|
|
|
|
|
throw new PicardException(String.format("Malformed query; start point %d lies after end point %d",start,stop));
|
|
|
|
|
|
2009-04-15 21:46:56 +08:00
|
|
|
FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);
|
|
|
|
|
|
2009-04-16 08:04:41 +08:00
|
|
|
if(stop > indexEntry.getSize())
|
2009-04-16 01:17:11 +08:00
|
|
|
throw new PicardException("Query asks for data past end of contig");
|
|
|
|
|
|
2009-04-16 08:04:41 +08:00
|
|
|
int length = (int)(stop - start + 1);
|
|
|
|
|
|
2009-10-10 01:46:56 +08:00
|
|
|
byte[] target = new byte[length];
|
|
|
|
|
ByteBuffer targetBuffer = ByteBuffer.wrap(target);
|
|
|
|
|
|
2009-04-15 21:46:56 +08:00
|
|
|
final int basesPerLine = indexEntry.getBasesPerLine();
|
2009-04-16 01:17:11 +08:00
|
|
|
final int bytesPerLine = indexEntry.getBytesPerLine();
|
2009-04-15 21:46:56 +08:00
|
|
|
|
2009-10-10 01:46:56 +08:00
|
|
|
final long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
|
|
|
|
|
final long stopOffset = ((stop-1)/basesPerLine)*bytesPerLine + (stop-1)%basesPerLine;
|
|
|
|
|
final int size = (int)(stopOffset-startOffset)+1;
|
2009-10-09 12:45:46 +08:00
|
|
|
|
2009-10-10 01:46:56 +08:00
|
|
|
ByteBuffer channelBuffer = ByteBuffer.allocate(size);
|
2009-04-15 21:46:56 +08:00
|
|
|
try {
|
2009-10-10 01:46:56 +08:00
|
|
|
channel.read(channelBuffer,indexEntry.getLocation()+startOffset);
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
2009-10-10 01:46:56 +08:00
|
|
|
catch(IOException ex) {
|
|
|
|
|
throw new PicardException("Unable to map FASTA file into memory.");
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
2009-10-10 01:46:56 +08:00
|
|
|
channelBuffer.position(0);
|
|
|
|
|
channelBuffer.limit(Math.min(basesPerLine-(int)startOffset%bytesPerLine,size));
|
|
|
|
|
|
|
|
|
|
while( channelBuffer.hasRemaining() ) {
|
|
|
|
|
targetBuffer.put(channelBuffer);
|
2009-04-15 21:46:56 +08:00
|
|
|
|
2009-10-10 01:46:56 +08:00
|
|
|
channelBuffer.limit(Math.min(channelBuffer.limit()+bytesPerLine,size));
|
|
|
|
|
channelBuffer.position(Math.min(channelBuffer.position()+bytesPerLine-basesPerLine,size));
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
2009-10-10 01:46:56 +08:00
|
|
|
|
|
|
|
|
return new ReferenceSequence( contig, sequenceDictionary.getSequenceIndex(contig), target );
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
2009-04-16 01:17:11 +08:00
|
|
|
/**
|
|
|
|
|
* Gets the next sequence if available, or null if not present.
|
|
|
|
|
* @return next sequence if available, or null if not present.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
public ReferenceSequence nextSequence() {
|
2009-04-16 01:17:11 +08:00
|
|
|
if( !indexIterator.hasNext() )
|
|
|
|
|
return null;
|
|
|
|
|
return getSequence( indexIterator.next().getContig() );
|
2009-04-15 21:46:56 +08:00
|
|
|
}
|
|
|
|
|
|
2009-12-09 00:09:04 +08:00
|
|
|
/**
|
|
|
|
|
* Reset the iterator over the index.
|
|
|
|
|
*/
|
2009-10-01 04:37:59 +08:00
|
|
|
@Override
|
|
|
|
|
public void reset() {
|
2009-10-10 01:46:56 +08:00
|
|
|
indexIterator = index.iterator();
|
2009-10-01 04:37:59 +08:00
|
|
|
}
|
|
|
|
|
|
2009-12-09 00:09:04 +08:00
|
|
|
/**
|
|
|
|
|
* A simple toString implementation for debugging.
|
|
|
|
|
* @return String representation of the file.
|
|
|
|
|
*/
|
2009-04-15 21:46:56 +08:00
|
|
|
public String toString() {
|
|
|
|
|
return this.file.getAbsolutePath();
|
|
|
|
|
}
|
|
|
|
|
}
|