From 52ad08298a91ba457c3d3656dac2c3f6617458f0 Mon Sep 17 00:00:00 2001 From: depristo Date: Sun, 22 Mar 2009 19:43:56 +0000 Subject: [PATCH] New FastaSequenceFile with support for poor-man's seek and querying the next contig name without loading the whole next contig into memory. Vastly speeds up the performance of jumping to distant parts of the genome with the location operator. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@137 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/utils/FastaSequenceFile2.java | 466 ++++++++++++++++++ 1 file changed, 466 insertions(+) create mode 100644 playground/java/src/org/broadinstitute/sting/utils/FastaSequenceFile2.java diff --git a/playground/java/src/org/broadinstitute/sting/utils/FastaSequenceFile2.java b/playground/java/src/org/broadinstitute/sting/utils/FastaSequenceFile2.java new file mode 100644 index 000000000..520a1ba25 --- /dev/null +++ b/playground/java/src/org/broadinstitute/sting/utils/FastaSequenceFile2.java @@ -0,0 +1,466 @@ +package org.broadinstitute.sting.utils; + +import edu.mit.broad.picard.PicardException; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.io.IoUtil; + +import java.io.*; + +import net.sf.samtools.SAMTextHeaderCodec; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.AsciiLineReader; +import net.sf.samtools.util.StringUtil; +import net.sf.samtools.util.RuntimeIOException; + +/** + * Implementation of ReferenceSequenceFile for reading from FASTA files. + * + * Now supports additional operations to seek and query the next contig from the file, for efficient + * implementation of jumping forward in the file. + * + * @author Tim Fennell + * @author Extended in parts by Mark DePristo + */ +public class FastaSequenceFile2 implements ReferenceSequenceFile { + private final File file; + private BufferedInputStream in; + private SAMSequenceDictionary sequenceDictionary = null; + private String currentContigName = null; + + /** + * Set to true to see lots of debugging output during operation + */ + private final boolean DEBUG = false; + + /** + * The name, if known, of the next contig in the file. Can be null, indicating either that there is + * no known next contig name, or that the file has been completely read + */ + private String nextContigName = null; + + /** Constructs a FastaSequenceFile that reads from the specified file. */ + public FastaSequenceFile2(final File file) { + this.file = file; + initializeInputStream(); + + // Try and locate the dictionary + String dictionaryName = file.getAbsolutePath(); + dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta")); + dictionaryName += ".dict"; + final File dictionary = new File(dictionaryName); + if (dictionary.exists()) { + IoUtil.assertFileIsReadable(dictionary); + + try { + final SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); + final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary); + if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) { + this.sequenceDictionary = header.getSequenceDictionary(); + } + } + catch (Exception e) { + throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e); + } + } + } + + /** + * Opens the file for input. Closes the previous input stream if there is one, and sets up all of the key + * variables such that this object's state is clean and ready for processing sequences. + */ + private void initializeInputStream() { + // we're at the start and haven't processed anything yet + nextContigName = currentContigName = null; + + if ( this.in != null ) { // this isn't our first time here + try { + this.in.close(); + } catch ( IOException e ) { + throw new RuntimeIOException("initializing InputStream failure", e); + } + } + + // Now reopen the input stream + this.in = new BufferedInputStream(IoUtil.openFileForReading(file)); + } + + /** + * Returns the list of sequence records associated with the reference sequence if found + * otherwise null. + */ + public SAMSequenceDictionary getSequenceDictionary() { + return this.sequenceDictionary; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // Support functions for seeking around in the file + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Returns the distance, in bp, between contig1 and contig2 according to this fasta file's dictionary. That is, + * the number of bp we'd have to traverse to move from the start of contig1 to reach the start of contig2. + * + * If contig1 occurs before contig2, a negative number is returned. 0 indicates that the contigs are the same. + * + * Returns Integer.MAX_VALUE if the sequence dictionary cannot be found + * + * @param contig1 + * @param contig2 + * @return distance in bp from the start of contig1 to the start of contig2 + */ + public long getDistanceBetweenContigs(final String contig1, final String contig2) { + assert contig1 != null; + assert contig2 != null; + + final SAMSequenceDictionary seqDict = getSequenceDictionary(); + + if ( seqDict == null ) // we couldn't load the reference dictionary + return Integer.MAX_VALUE; + + SAMSequenceRecord contig1Rec = seqDict.getSequence(contig1); + SAMSequenceRecord contig2Rec = seqDict.getSequence(contig2); + + assert contig1Rec != null : "Contig1 record is null: " + contig1; + assert contig1Rec != null : "Contig2 record is null: " + contig2; + + if ( DEBUG ) + System.out.printf("Contig1=(%s, %d), contig2=(%s, %d)%n", + contig1, contig1Rec.getSequenceIndex(), + contig2, contig2Rec.getSequenceIndex()); + + int startIndex = Math.min(contig1Rec.getSequenceIndex(), contig2Rec.getSequenceIndex()); + int lastIndex = Math.max(contig1Rec.getSequenceIndex(), contig2Rec.getSequenceIndex()); + + long bytesToTraverse = 0; + for ( int i = startIndex; i < lastIndex; i++ ) { + SAMSequenceRecord rec = seqDict.getSequence(i); + bytesToTraverse += rec.getSequenceLength(); + if ( DEBUG ) + System.out.printf(" -> Traversing from %15s to %15s requires reading at least %10d bytes to pass contig %15s, total bytes %10d%n", + contig1, contig2, rec.getSequenceLength(), rec.getSequenceName(), bytesToTraverse); + } + + if ( contig1Rec.getSequenceIndex() > contig2Rec.getSequenceIndex() ) + bytesToTraverse *= -1; // we are going backward! + + if ( DEBUG ) System.out.printf(" -> total distance is %d%n", bytesToTraverse); + + return bytesToTraverse; + } + + /** + * Seeks to seekContig in the fasta file, such that nextSequence() will read the seekContig from the fasta + * file. Only allows forward seeks. Throws a RuntimeIOException if the seekContig is before the current + * contig. + * + * @param seekContig the contig I want to seek to + * @return true on success + * + * @see #seekToContig(String) + */ + public boolean seekToContig( final String seekContig ) { + return seekToContig(seekContig, false); + } + + /** + * Seeks to seekContig in the fasta file, such that nextSequence() will read the seekContig from the fasta + * file. If enableBacktracking is false, only allows forward seeks, and throws a RuntimeIOException if the + * seekContig is before the current contig. If enableBacktracking is true, then if seekContig is before + * the current contig, resets the input stream and seeks to the contig. + * + * Requires that the fasta file have a SequenceDictionary associated with it. Otherwises throws an error + * + * @param seekContig The contig I want to seek to + * @param enableBacktracking Should we allow seeks to contigs earlier in the file? + * @return true on success + */ + public boolean seekToContig(final String seekContig, boolean enableBacktracking ) { + if ( DEBUG ) System.out.printf("seekToContig( %s, %b )%n", seekContig, enableBacktracking); + + String curContig = getContigName(); + String nextContig = null; + + if ( curContig == null ) { + System.out.printf("CurrentContig is null"); + if ( this.sequenceDictionary == null ) + throw new PicardException( String.format("Seeking within contigs requires FASTA dictionary, but none was available for %s", this.file )); + + // We only reach this point when we're seeking before we've read in any of the fasta file, + // so assume we are at the start of the file + nextContig = this.sequenceDictionary.getSequence(0).getSequenceName(); + } + else + nextContig = getNextContigName(); + + if ( nextContig == null ) // we're are at the end of the stream + return false; + + // we have already read in the current contig, we are jumping from the next contig onwards + long dist = getDistanceBetweenContigs(nextContig, seekContig); + + if ( dist == Integer.MAX_VALUE ) + return false; // we don't know where to go + else if ( dist == 0 ) + return true; // We already here! + else if ( dist < 0 ) { + if ( enableBacktracking ) { + // System.out.printf("*** Backtracking to %s%n", seekContig); + // restart from the beginning, and try again + initializeInputStream(); + return seekToContig(seekContig, enableBacktracking); + } else + return false; // we're not going backwards just yet + } + else { + if ( DEBUG ) System.out.printf("Going to seek to contig %s with skip %d%n", seekContig, dist); + // we're actually going to jump somewhere, so prepare the state + this.nextContigName = null; // reset the contig info + + // TODO: this is a dangerous method -- can we get access to the underlying file object seek? + long bytesToSkip = dist; + while ( bytesToSkip > 0 ) { + try { + final long skipped = this.in.skip(bytesToSkip); + bytesToSkip -= skipped; + // System.out.printf(" -> skipping %d, %d remaining%n", skipped, bytesToSkip); + } catch (IOException ioe) { + throw new PicardException("Error reading from file: " + this.file.getAbsolutePath(), ioe); + } + } + + if ( bytesToSkip != 0 ) { // skip dist bytes + throw new PicardException(String.format("Failed to skip all of the %d bases requested, only got %d", dist, dist - bytesToSkip * 2)); + } + + // at this point we're ready to start looking for the next header, so call seekNextContigName() + final String next = seekForNextContig(seekContig); + + if ( ! next.equals(seekContig) ) // OMG, what the hell happened, throw a runtime exception + throw new PicardException(String.format("Failed to seek from %s to %s, ended up at %s", + curContig, seekContig, next)); + else { + this.currentContigName = next; + return true; + } + } + } + + /** + * Reads the next contig from the fasta file, and returns it as a ReferenceSequence. + * + * @return null if there are no more sequences in the fasta stream + */ + public ReferenceSequence nextSequence() { + if ( DEBUG ) System.out.printf("Calling nextSequence()%n"); + + // Read the header line + currentContigName = getNextContigName(); + if ( currentContigName == null ) return null; // no more sequences! + + int index = -1; + if ( this.sequenceDictionary != null ) + index = this.sequenceDictionary.getSequenceIndex(currentContigName); + + // Read the sequence + byte[] tmp = new byte[4096]; + int basesRead; + int totalBasesRead = 0; + final int knownLength = (index == -1) ? -1 : this.sequenceDictionary.getSequence(index).getSequenceLength(); + final int lengthByteArray = (knownLength != -1) ? knownLength : 250000000; + byte[] bases = new byte[lengthByteArray]; + + while ((basesRead = readNextLine(bases, totalBasesRead)) != 0) { + totalBasesRead += basesRead; + + // Make sure we'll have space for the next iteration if we need it + if (totalBasesRead == knownLength) { + //System.out.printf("Read bases: %s%n", StringUtil.bytesToString(bases, totalBasesRead - basesRead, basesRead).trim()); + + assert peekOneByte() == -1 || peekOneByte() == '>' : "We somehow managed to read in enough bytes for the contig, but didn't pass through the entire contig"; + break; + } else { + final byte b = peekOneByte(); + if (b == -1 || b == '>') { + break; + } + else if (totalBasesRead == bases.length) { + tmp = new byte[bases.length * 2]; + System.arraycopy(bases, 0, tmp, 0, totalBasesRead); + bases = tmp; + tmp = null; + } + } + } + + // And lastly resize the array down to the right size + if (totalBasesRead != bases.length) { + tmp = new byte[totalBasesRead]; + System.arraycopy(bases, 0, tmp, 0, totalBasesRead); + bases = tmp; + tmp = null; + } + + assert knownLength == -1 || knownLength == bases.length; + + this.nextContigName = null; // we no longer know what the next contig name is + + if ( DEBUG ) System.out.printf(" => nextSequence() is returning %s, known length = %d%n", this.currentContigName, knownLength); + if ( DEBUG ) System.out.printf(" => nextSequence() next is %s%n", this.getNextContigName()); + + return new ReferenceSequence(currentContigName, index, bases); + } + + /** + * Returns the next of the next contig, or null if there are no more contigs. Stateful function, it + * remembers the name of the next contig. Use readNextContigName for stateless operation. This function + * is primarily useful if you need to know what the next contig is in the stream for algorithmic purposes. + * + * Note that this function assumes the stream is sitting right at the end of the previous contig, or at the + * beginning of the file. + * + * @return the name of the next contig, or null if there is no next contig + */ + public String getNextContigName() { + if ( DEBUG ) System.out.printf("getNextContigName() => %s%n", this.nextContigName); + + if ( this.nextContigName == null ) { + // If it's not null, we've already looked up the next contig name, just return it and happily continue + // Otherwise we need to actually read in the name + this.nextContigName = readNextContigName(); + } + + if ( DEBUG ) System.out.printf("nextContigName is now %s%n", nextContigName); + return this.nextContigName; + } + + /** + * Simply reads the next contig name from the fasta input stream. It assumes that the stream is positioned + * immediately before the start of the next contig. Calling it without this condition met results in the + * RuntimeIOException being thrown. This method advances the fasta stream itself -- subsequent calls to the + * method will lead to errors. getNextContigName is the stateful version. + * + * @See getNextContigName() + * + * @return the string name of the next contig + */ + private String readNextContigName() { + // Otherwise we need to actually read in the name + byte[] tmp = new byte[4096]; + final int nameLength = readNextLine(tmp, 0); + if (nameLength != 0) { + // 0 means no more sequences! + if ( tmp[0] != '>' ) + throw new RuntimeIOException("The next line is supposed to be a fasta contig start but found " + StringUtil.bytesToString(tmp, 0, nameLength).trim()); + + return StringUtil.bytesToString(tmp, 1, nameLength).trim(); + } + + return null; + } + + /** + * @return The name of the contig we returned in the last call to nextSequence() + */ + public String getContigName() { + return this.currentContigName; + } + + /** + * Moves the IO stream to right before the next contig marker in the fasta file, for anywhere inside the + * previous contig. It is primarily useful as a supplementary routine for jumping forward in the file by + * N bases, since we don't know exactly how far way N bases in bytes will be in the file. So a guess jump + * will put us somewhere before the target contig, and we use this routine to seek forward to the actual + * contig we want. + * + * @return the name of the next contig, as a string + */ + public String seekForNextContig(final String targetContig ) { + //System.out.printf("seekForNextContig()%n"); + + int basesRead; + int totalBasesRead = 0; + byte[] bases = new byte[4096]; + int i = 0; + while ((basesRead = readNextLine(bases, 0)) != 0) { + totalBasesRead += basesRead; + + // Keep looking for the > marking the start of the line, and stop + final byte b = peekOneByte(); + if (b == -1 || b == '>') { + final String foundContig = readNextContigName(); + // System.out.printf("Found a contig name line %s%n", foundContig); + final int foundIndex = this.sequenceDictionary.getSequenceIndex(foundContig); + final int ourIndex = this.sequenceDictionary.getSequenceIndex(targetContig); + + if ( foundIndex == ourIndex ) { + // we found our target! + this.nextContigName = foundContig; // store the right answer + if ( DEBUG ) System.out.printf("seekForNextContig found %s%n", foundContig); + return foundContig; + } + else if ( foundIndex <= ourIndex ) + // we are still looking for our contig, the seek estimate was inaccurate relative to the size of contings in this area + continue; + else { + // This is really bad -- we are past our target + throw new RuntimeIOException(String.format("Seek pushes us past our target contig of %s, instead we found %s, which is after the target in the sequence dictions", targetContig, foundContig)); + } + } + } + + + return null; + } + + /** Peeks one non line-terminating byte from the file and returns it. */ + private byte peekOneByte() { + try { + this.in.mark(16); + byte b = '\n'; + while (b == '\n' || b == '\r') { + b = (byte) this.in.read(); + } + + this.in.reset(); + return b; + } + catch (IOException ioe) { + throw new PicardException("Error reading from file: " + this.file.getAbsolutePath(), ioe); + } + } + + /** + * Reads the next line from the file and puts the bytes into the buffer starting at + * the provided index. Stops when the buffer is full, a line terminator is hit or + * the end of the file is hit. + */ + private int readNextLine(final byte[] buffer, final int index) { + try { + int next; + int read = 0; + while ((next = this.in.read()) != -1 && index + read < buffer.length) { + final byte b = (byte) next; + if (b == '\r' || b == '\n') { + if (read != 0) return read; + } + else { + buffer[index + read++] = b; + } + } + return read; + } + catch (IOException ioe) { + throw new PicardException("Error reading line from file: " + this.file.getAbsolutePath(), ioe); + } + } + + /** Returns the full path to the reference file. */ + public String toString() { + return this.file.getAbsolutePath(); + } +} \ No newline at end of file