Basic indexed fasta POC in place. Requires a more complete implementation of the ReferenceSequenceFile interface,
and much more testing. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@425 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7949e377e4
commit
182626576f
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
|
||||
import java.util.Scanner;
|
||||
import java.util.HashMap;
|
||||
|
|
@ -29,6 +30,20 @@ public class FastaSequenceIndex {
|
|||
* @throws PicardException if file is of invalid format.
|
||||
*/
|
||||
public FastaSequenceIndex( File indexFile ) throws FileNotFoundException {
|
||||
if(!indexFile.exists())
|
||||
throw new FileNotFoundException("Index file is missing");
|
||||
|
||||
IoUtil.assertFileIsReadable(indexFile);
|
||||
parseIndexFile(indexFile);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Parse the contents of an index file, caching the results internally.
|
||||
* @param indexFile File to parse.
|
||||
* @throws FileNotFoundException Thrown if file could not be opened.
|
||||
*/
|
||||
private void parseIndexFile(File indexFile) throws FileNotFoundException {
|
||||
Scanner scanner = new Scanner(indexFile);
|
||||
|
||||
while( scanner.hasNext() ) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,142 @@
|
|||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.CharsetDecoder;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.CharacterCodingException;
|
||||
import java.util.Scanner;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
* Date: Apr 14, 2009
|
||||
* Time: 2:14:26 PM
|
||||
*
|
||||
* A fasta file driven by an index for fast, concurrent lookups. Supports two interfaces:
|
||||
* the ReferenceSequenceFile for old-style, stateful lookups and a direct getter.
|
||||
*/
|
||||
public class IndexedFastaSequenceFile implements ReferenceSequenceFile {
|
||||
// Using buffer size of 4k because that's what Picard uses; no thought went into this.
|
||||
private static final int BUFFERSIZE = 4096;
|
||||
|
||||
private final File file;
|
||||
private FileInputStream in;
|
||||
private FileChannel channel;
|
||||
|
||||
private final FastaSequenceIndex index;
|
||||
|
||||
private String currentContigName = null;
|
||||
|
||||
public IndexedFastaSequenceFile(File file) throws FileNotFoundException {
|
||||
this.file = file;
|
||||
// TODO: Add support for gzipped files
|
||||
in = new FileInputStream(file);
|
||||
channel = in.getChannel();
|
||||
|
||||
File indexFile = new File(file.getAbsolutePath() + ".fai");
|
||||
index = new FastaSequenceIndex(indexFile);
|
||||
}
|
||||
|
||||
public SAMSequenceDictionary getSequenceDictionary() {
|
||||
throw new UnsupportedOperationException("Indexed fasta files do not require dictionaries");
|
||||
}
|
||||
|
||||
public ReferenceSequence getSequence( String contig ) {
|
||||
return getSubsequenceAt( contig, 0, (int)index.getIndexEntry(contig).getSize() );
|
||||
}
|
||||
|
||||
public ReferenceSequence getSubsequenceAt( String contig, int pos, int length ) {
|
||||
FastaSequenceIndexEntry indexEntry = index.getIndexEntry(contig);
|
||||
|
||||
final int basesPerLine = indexEntry.getBasesPerLine();
|
||||
|
||||
// Start reading at the closest start-of-line to our data.
|
||||
long readStart = indexEntry.getLocation() + (pos / basesPerLine);
|
||||
int dataOfInterestStart = pos % basesPerLine;
|
||||
|
||||
byte[] accumulator = new byte[length];
|
||||
int nextAccumulatorSlot = 0;
|
||||
|
||||
while(length > 0) {
|
||||
ByteBuffer buffer = ByteBuffer.allocateDirect(BUFFERSIZE);
|
||||
try {
|
||||
channel.read(buffer, readStart);
|
||||
readStart += BUFFERSIZE;
|
||||
}
|
||||
catch( IOException ex ) {
|
||||
throw new PicardException("Unable to read directly from fasta", ex);
|
||||
}
|
||||
|
||||
final int basesTransferred = transferToBuffer( buffer,
|
||||
dataOfInterestStart,
|
||||
accumulator,
|
||||
nextAccumulatorSlot,
|
||||
length );
|
||||
|
||||
nextAccumulatorSlot += basesTransferred;
|
||||
length -= basesTransferred;
|
||||
dataOfInterestStart = 0;
|
||||
}
|
||||
|
||||
return new ReferenceSequence( contig, pos, accumulator );
|
||||
}
|
||||
|
||||
/**
|
||||
* Transfers the contents of the given ByteBuffer to the given byte array, discarding
|
||||
* line breaks at regular intervals. Copies as many as length bases, depending on the
|
||||
* buffer size. Returns the number of bytes actually copied.
|
||||
* @param source The source ByteBuffer.
|
||||
* @param sourceStart The starting position to copy within the byte buffer
|
||||
* @param target Destination for the data
|
||||
* @param targetStart Index into target buffer.
|
||||
* @param length How much data to move.
|
||||
* @return How many bytes were actually transferred.
|
||||
*/
|
||||
private int transferToBuffer( ByteBuffer source,
|
||||
int sourceStart,
|
||||
byte[] target,
|
||||
int targetStart,
|
||||
int length ) {
|
||||
source.position(sourceStart);
|
||||
int basesRead = 0;
|
||||
CharsetDecoder decoder = Charset.forName("US-ASCII").newDecoder();
|
||||
|
||||
Scanner scanner = null;
|
||||
try {
|
||||
scanner = new Scanner(decoder.decode(source).toString());
|
||||
}
|
||||
catch(CharacterCodingException ex) {
|
||||
throw new PicardException("Malformed subsequence",ex);
|
||||
}
|
||||
|
||||
while( scanner.hasNext() && basesRead < length ) {
|
||||
String sourceLine = scanner.nextLine();
|
||||
byte[] sourceData = sourceLine.getBytes();
|
||||
int basesToTransfer = Math.min(sourceData.length,length - basesRead);
|
||||
System.arraycopy(sourceData,0,target,targetStart+basesRead,basesToTransfer);
|
||||
|
||||
basesRead += basesToTransfer;
|
||||
}
|
||||
return basesRead;
|
||||
}
|
||||
|
||||
|
||||
public ReferenceSequence nextSequence() {
|
||||
return getSubsequenceAt("chrM", 0, 20);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.file.getAbsolutePath();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.Assert;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: hanna
|
||||
* Date: Apr 14, 2009
|
||||
* Time: 2:37:29 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class IndexedFastaSequenceFileTest extends BaseTest {
|
||||
private static String sequenceFileName;
|
||||
private IndexedFastaSequenceFile sequenceFile = null;
|
||||
|
||||
private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
|
||||
private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" +
|
||||
"TTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAGACGCTG" +
|
||||
"GAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTCCTGCCTCATT";
|
||||
private final String firstBasesOfChr1 = "taaccctaaccctaacccta";
|
||||
private final String firstBasesOfChr8 = "GCAATTATGACACAAAAAAT";
|
||||
|
||||
@BeforeClass
|
||||
public static void initialize() {
|
||||
sequenceFileName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
|
||||
}
|
||||
|
||||
@Before
|
||||
public void doForEachTest() throws FileNotFoundException {
|
||||
sequenceFile = new IndexedFastaSequenceFile( new File(sequenceFileName) );
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOpenFile() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
Assert.assertNotNull( sequenceFile );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testOpenFile runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFirstSequence() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.nextSequence();
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
firstBasesOfChrM,
|
||||
StringUtil.bytesToString( sequence.getBases() ) );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFirstSequenceExtended() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM",0,extendedBasesOfChrM.length());
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
extendedBasesOfChrM.substring(0,110),
|
||||
StringUtil.bytesToString( sequence.getBases(),0,110 ) );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testFirstSequenceExtended runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReadStartingInCenterOfLine() {
|
||||
final int bytesToChopOff = 5;
|
||||
String truncated = extendedBasesOfChrM.substring(bytesToChopOff);
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chrM", bytesToChopOff ,truncated.length() );
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
truncated,
|
||||
StringUtil.bytesToString( sequence.getBases() ) );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testReadStartingInCenterOfLine runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompleteContigRead() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSequence("chrM");
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals("chrM is incorrect",
|
||||
StringUtil.bytesToString(expectedSequence.getBases(),0,4096),
|
||||
StringUtil.bytesToString(sequence.getBases(),0,4096) );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testCompleteContigRead runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue