Cleanup...deprecate FastaSequenceFile2 in favor of IndexedFastaSequenceFile or ReferenceSequenceFile from Picard, depending on the application.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1196 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-07-08 18:49:08 +00:00
parent 0a67386525
commit 433ad1f060
15 changed files with 54 additions and 950 deletions

View File

@ -1,7 +1,6 @@
package org.broadinstitute.sting.gatk.iterators;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.util.Iterator;

View File

@ -1,213 +0,0 @@
package org.broadinstitute.sting.gatk.iterators;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.util.RuntimeIOException;
import net.sf.samtools.util.StringUtil;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Feb 24, 2009
* Time: 10:45:01 AM
* To change this template use File | Settings | File Templates.
*/
public class ReferenceIterator implements Iterator<ReferenceIterator> {
// enable debugging output?
private final boolean DEBUG = false;
// The reference sequence file generator
private FastaSequenceFile2 refFile;
private ReferenceSequence currentContig = null;
//private ReferenceSequence nextContig = null;
private long offset = -1;
public ReferenceIterator(FastaSequenceFile2 refFile) {
this.refFile = refFile;
}
/** our log, which we want to capture anything from this class */
private static Logger logger = Logger.getLogger(ReferenceIterator.class);
// --------------------------------------------------------------------------------------------------------------
//
// Accessing data
//
// --------------------------------------------------------------------------------------------------------------
public byte getBaseAsByte() {
return currentContig.getBases()[(int) offset];
}
public String getBaseAsString() {
assert offset > -1 : currentContig.getName() + " index is " + offset;
//assert offset < currentContig.getBases().();
return StringUtil.bytesToString(currentContig.getBases(), (int) offset, 1);
}
public char getBaseAsChar() {
return getBaseAsString().charAt(0);
}
public ReferenceSequence getCurrentContig() {
return currentContig;
}
public long getPosition() {
return offset + 1;
}
public GenomeLoc getLocation() {
return GenomeLocParser.createGenomeLoc(getCurrentContig().getName(), getPosition());
}
// --------------------------------------------------------------------------------------------------------------
//
// Iterator routines
//
// --------------------------------------------------------------------------------------------------------------
public synchronized boolean hasNext() {
if (currentContig == null || offset + 1 < currentContig.length()) {
return true;
} else {
return readNextContig();
}
}
public synchronized ReferenceIterator next() {
if (currentContig != null) {
if (DEBUG)
logger.debug(String.format(" -> %s:%d %d%n", currentContig.getName(), offset, currentContig.length()));
}
offset++; // move on to the next position
if (currentContig == null || offset >= currentContig.length()) {
// We need to update the contig
if (readNextContig()) {
// We sucessfully loaded the next contig, recursively call next
return next();
} else {
throw new NoSuchElementException();
}
} else {
// We're good to go -- we're in the current contig
return this;
}
}
public void remove() {
throw new UnsupportedOperationException();
}
// --------------------------------------------------------------------------------------------------------------
//
// Jumping forward
//
// --------------------------------------------------------------------------------------------------------------
public ReferenceIterator seekForward(final GenomeLoc loc) {
assert loc != null : "seekForward location is null";
return seekForwardOffset(loc.getContig(), loc.getStart() - 1);
}
public ReferenceIterator seekForward(final String contigName, final long pos) {
return seekForwardOffset(contigName, pos - 1);
}
/**
* Helper routine that doesn't move the contigs around, it just checks that everything is kosher in the seek
* within this chromosome
*
* @param seekContigName name for printing pursues, asserted to be the current contig name
* @param seekOffset where we want to be in this contig
* @return this setup to be at seekoffset within seekContigName
*/
private ReferenceIterator seekForwardOffsetOnSameContig(final String seekContigName, final long seekOffset) {
assert seekContigName.equals(currentContig.getName()) : String.format("only works on this contig, but the current %s and sought %s contigs are different!", currentContig.getName(), seekContigName);
// we're somewhere on this contig
if (seekOffset < offset ) {
// bad boy -- can't go backward safely
throw new IllegalArgumentException(String.format("Invalid seek %s => %s, which is usually due to out of order reads%n",
GenomeLocParser.createGenomeLoc(currentContig.getName(), offset), GenomeLocParser.createGenomeLoc(seekContigName, seekOffset)));
} else if (seekOffset >= currentContig.length()) {
// bad boy -- can't go beyond the contig length
throw new IllegalArgumentException(String.format("Invalid seek to %s, which is beyond the end of the contig%n",
GenomeLocParser.createGenomeLoc(currentContig.getName(), seekOffset + 1)));
} else {
offset = seekOffset - 1;
return next();
}
}
private ReferenceIterator seekForwardOffset(final String seekContigName, final long seekOffset) {
assert seekContigName != null : "seekContigName is null";
assert seekOffset >= 0 : "seekOffset < 0: " + seekOffset;
// jumps us forward in the sequence to the contig / pos
if (currentContig == null)
next();
if (DEBUG)
logger.debug(String.format(" -> Seeking to %s %d from %s %d%n", seekContigName, seekOffset, currentContig.getName(), offset));
int cmpContigs = GenomeLocParser.compareContigs(seekContigName,currentContig.getName());
if ( cmpContigs < 0 && GenomeLocParser.hasKnownContigOrdering() ) { // if we know the order of contigs and we are already past the contig we seek, it's too late!
// The contig we are looking for is before the currentContig -- it's an error
throw new IllegalArgumentException(String.format("Invalid seek %s => %s, contigs/sequences are out of order%n",
GenomeLocParser.createGenomeLoc(currentContig.getName(), offset), GenomeLocParser.createGenomeLoc(seekContigName, seekOffset)));
}
if ( cmpContigs > 0 || (! GenomeLocParser.hasKnownContigOrdering() ) && cmpContigs != 0 ) { // if contig we seek is still ahead, or if we have no idea what the order is and current contig is not what we seek
// then try to seek forward in the reference file until we get the contig we need
if (DEBUG)
logger.debug(String.format(" -> Seeking in the fasta file to %s from %s%n", seekContigName, currentContig.getName()));
if (!refFile.seekToContig(seekContigName)) { // ok, do the seek
// a false result indicates a failure, throw a somewhat cryptic call
throw new RuntimeIOException(String.format("Unexpected seek failure from %s to %s%n",
GenomeLocParser.createGenomeLoc(currentContig.getName(), offset), GenomeLocParser.createGenomeLoc(seekContigName, seekOffset)));
}
readNextContig(); // since we haven't failed, we just read in the next contig (which is seekContigName)
}
// at this point, the current contig is seekContigName, so just do a bit more error checking and be done
return seekForwardOffsetOnSameContig(seekContigName, seekOffset);
}
// --------------------------------------------------------------------------------------------------------------
//
// Interal state manipulation
//
// --------------------------------------------------------------------------------------------------------------
protected boolean readNextContig() {
// returns true if we had another contig to load
currentContig = refFile.nextSequence();
offset = -1;
return currentContig != null;
}
/**
* Simple forwarding method to the refFile itself
*
* @return
*/
public String nextContigName() {
return this.refFile.getNextContigName();
}
}

View File

@ -17,7 +17,6 @@ import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.util.*;
@ -54,9 +53,6 @@ public abstract class TraversalEngine {
// The reference data -- filename, refSeqFile, and iterator
protected File refFileName = null; // the name of the reference file
//private ReferenceSequenceFile refFile = null;
protected FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard!
protected ReferenceIterator refIter = null;
// Progress tracker for the sam file
protected FileProgressTracker samReadingTracker = null;
@ -317,7 +313,6 @@ public abstract class TraversalEngine {
*/
public boolean initialize() {
lastProgressPrintTime = startTime = System.currentTimeMillis();
initializeReference();
// Initial the reference ordered data iterators
initializeRODs();
@ -350,21 +345,6 @@ public abstract class TraversalEngine {
}
}
/**
* Prepare the reference for stream processing
*/
protected void initializeReference() {
if (refFileName != null) {
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
this.refFile = new FastaSequenceFile2(refFileName); // todo: replace when FastaSequenceFile2 is in picard
this.refIter = new ReferenceIterator(this.refFile);
if (!GenomeLocParser.setupRefContigOrdering(this.refFile)) {
// We couldn't process the reference contig ordering, fail since we need it
Utils.scareUser(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. Please use /seq/software/picard/current/bin/CreateSequenceDictionary.jar to create a sequence dictionary for your file", refFileName));
}
}
}
/**
* Prepare the list of reference ordered data iterators for each of the rods
*
@ -378,17 +358,6 @@ public abstract class TraversalEngine {
}
}
/**
* An inappropriately placed testing of reading the reference
*/
protected void testReference() {
while (true) {
ReferenceSequence ref = refFile.nextSequence();
logger.debug(String.format("%s %d %d", ref.getName(), ref.length(), System.currentTimeMillis()));
printProgress(true, "loci", GenomeLocParser.createGenomeLoc("foo", 1));
}
}
// --------------------------------------------------------------------------------------------------------------
//
// shutdown

View File

@ -8,9 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.rodGFF;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import net.sf.samtools.SAMRecord;
import net.sf.picard.reference.ReferenceSequence;
import java.util.List;

View File

@ -8,7 +8,6 @@ import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.containers.BoundedScoringSet;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.util.ArrayList;

View File

@ -3,9 +3,11 @@ package org.broadinstitute.sting.secondarybase;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.reference.ReferenceSequence;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.util.ArrayList;
@ -159,7 +161,7 @@ public class BasecallingTrainer {
* @return a vector of perfect reads, grouped by tile
*/
private Vector<HashMap<String, SAMRecord>> getPerfectAlignmentsByTile(File samIn, File reference) {
FastaSequenceFile2 ref = new FastaSequenceFile2(reference);
ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference);
String currentContig = "none";
byte[] refbases = null;
@ -175,9 +177,11 @@ public class BasecallingTrainer {
int offset = sr.getAlignmentStart();
if (!currentContig.matches(sr.getReferenceName())) {
ref.seekToContig(sr.getReferenceName());
ReferenceSequence refSeq = ref.nextSequence();
while( !refSeq.getName().equals(sr.getReferenceName()) )
refSeq = ref.nextSequence();
currentContig = sr.getReferenceName();
refbases = ref.nextSequence().getBases();
refbases = refSeq.getBases();
}
int mismatches = 0;

View File

@ -1,472 +0,0 @@
package org.broadinstitute.sting.utils.fasta;
import net.sf.picard.PicardException;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.io.IoUtil;
import java.io.*;
import net.sf.samtools.SAMTextHeaderCodec;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import net.sf.samtools.util.AsciiLineReader;
import net.sf.samtools.util.StringUtil;
import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger;
/**
* Implementation of ReferenceSequenceFile for reading from FASTA files.
*
* Now supports additional operations to seek and query the next contig from the file, for efficient
* implementation of jumping forward in the file.
*
* @author Tim Fennell
* @author Extended in parts by Mark DePristo
*/
public class FastaSequenceFile2 implements ReferenceSequenceFile {
private final File file;
private BufferedInputStream in;
private SAMSequenceDictionary sequenceDictionary = null;
private String currentContigName = null;
/**
* our log, which we want to capture anything from this class
*/
private static Logger logger = Logger.getLogger(FastaSequenceFile2.class);
/**
* Set to true to see lots of debugging output during operation
*/
private final boolean DEBUG = false;
/**
* The name, if known, of the next contig in the file. Can be null, indicating either that there is
* no known next contig name, or that the file has been completely read
*/
private String nextContigName = null;
/** Constructs a FastaSequenceFile that reads from the specified file. */
public FastaSequenceFile2(final File file) {
this.file = file;
initializeInputStream();
// Try and locate the dictionary
String dictionaryName = file.getAbsolutePath();
dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta"));
dictionaryName += ".dict";
final File dictionary = new File(dictionaryName);
if (dictionary.exists()) {
IoUtil.assertFileIsReadable(dictionary);
try {
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary);
if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) {
this.sequenceDictionary = header.getSequenceDictionary();
}
}
catch (Exception e) {
throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
}
}
}
/**
* Opens the file for input. Closes the previous input stream if there is one, and sets up all of the key
* variables such that this object's state is clean and ready for processing sequences.
*/
private void initializeInputStream() {
// we're at the start and haven't processed anything yet
nextContigName = currentContigName = null;
if ( this.in != null ) { // this isn't our first time here
try {
this.in.close();
} catch ( IOException e ) {
throw new RuntimeIOException("initializing InputStream failure", e);
}
}
// Now reopen the input stream
this.in = new BufferedInputStream(IoUtil.openFileForReading(file));
}
/**
* Returns the list of sequence records associated with the reference sequence if found
* otherwise null.
*/
public SAMSequenceDictionary getSequenceDictionary() {
return this.sequenceDictionary;
}
// --------------------------------------------------------------------------------------------------------------
//
// Support functions for seeking around in the file
//
// --------------------------------------------------------------------------------------------------------------
/**
* Returns the distance, in bp, between contig1 and contig2 according to this fasta file's dictionary. That is,
* the number of bp we'd have to traverse to move from the start of contig1 to reach the start of contig2.
*
* If contig1 occurs before contig2, a negative number is returned. 0 indicates that the contigs are the same.
*
* Returns Integer.MAX_VALUE if the sequence dictionary cannot be found
*
* @param contig1
* @param contig2
* @return distance in bp from the start of contig1 to the start of contig2
*/
public long getDistanceBetweenContigs(final String contig1, final String contig2) {
assert contig1 != null;
assert contig2 != null;
final SAMSequenceDictionary seqDict = getSequenceDictionary();
if ( seqDict == null ) // we couldn't load the reference dictionary
return Integer.MAX_VALUE;
SAMSequenceRecord contig1Rec = seqDict.getSequence(contig1);
SAMSequenceRecord contig2Rec = seqDict.getSequence(contig2);
assert contig1Rec != null : "Contig1 record is null: " + contig1;
assert contig1Rec != null : "Contig2 record is null: " + contig2;
if ( DEBUG )
logger.debug(String.format("Contig1=(%s, %d), contig2=(%s, %d)%n",
contig1, contig1Rec.getSequenceIndex(),
contig2, contig2Rec.getSequenceIndex()));
int startIndex = Math.min(contig1Rec.getSequenceIndex(), contig2Rec.getSequenceIndex());
int lastIndex = Math.max(contig1Rec.getSequenceIndex(), contig2Rec.getSequenceIndex());
long bytesToTraverse = 0;
for ( int i = startIndex; i < lastIndex; i++ ) {
SAMSequenceRecord rec = seqDict.getSequence(i);
bytesToTraverse += rec.getSequenceLength();
if ( DEBUG )
logger.debug(String.format(" -> Traversing from %15s to %15s requires reading at least %10d bytes to pass contig %15s, total bytes %10d%n",
contig1, contig2, rec.getSequenceLength(), rec.getSequenceName(), bytesToTraverse));
}
if ( contig1Rec.getSequenceIndex() > contig2Rec.getSequenceIndex() )
bytesToTraverse *= -1; // we are going backward!
if ( DEBUG ) logger.debug(String.format(" -> total distance is %d%n", bytesToTraverse));
return bytesToTraverse;
}
/**
* Seeks to seekContig in the fasta file, such that nextSequence() will read the seekContig from the fasta
* file. Only allows forward seeks. Throws a RuntimeIOException if the seekContig is before the current
* contig.
*
* @param seekContig the contig I want to seek to
* @return true on success
*
* @see #seekToContig(String)
*/
public boolean seekToContig( final String seekContig ) {
return seekToContig(seekContig, false);
}
/**
* Seeks to seekContig in the fasta file, such that nextSequence() will read the seekContig from the fasta
* file. If enableBacktracking is false, only allows forward seeks, and throws a RuntimeIOException if the
* seekContig is before the current contig. If enableBacktracking is true, then if seekContig is before
* the current contig, resets the input stream and seeks to the contig.
*
* Requires that the fasta file have a SequenceDictionary associated with it. Otherwises throws an error
*
* @param seekContig The contig I want to seek to
* @param enableBacktracking Should we allow seeks to contigs earlier in the file?
* @return true on success
*/
public boolean seekToContig(final String seekContig, boolean enableBacktracking ) {
if ( DEBUG ) logger.debug(String.format("seekToContig( %s, %b )%n", seekContig, enableBacktracking));
String curContig = getContigName();
String nextContig = null;
if ( curContig == null ) {
logger.info(String.format("CurrentContig is null"));
if ( this.sequenceDictionary == null )
throw new PicardException( String.format("Seeking within contigs requires FASTA dictionary, but none was available for %s", this.file ));
// We only reach this point when we're seeking before we've read in any of the fasta file,
// so assume we are at the start of the file
nextContig = this.sequenceDictionary.getSequence(0).getSequenceName();
}
else
nextContig = getNextContigName();
if ( nextContig == null ) // we're are at the end of the stream
return false;
// we have already read in the current contig, we are jumping from the next contig onwards
long dist = getDistanceBetweenContigs(nextContig, seekContig);
if ( dist == Integer.MAX_VALUE )
return false; // we don't know where to go
else if ( dist == 0 )
return true; // We already here!
else if ( dist < 0 ) {
if ( enableBacktracking ) {
// System.out.printf("*** Backtracking to %s%n", seekContig);
// restart from the beginning, and try again
initializeInputStream();
return seekToContig(seekContig, enableBacktracking);
} else
return false; // we're not going backwards just yet
}
else {
if ( DEBUG ) logger.debug(String.format("Going to seek to contig %s with skip %d%n", seekContig, dist));
// we're actually going to jump somewhere, so prepare the state
this.nextContigName = null; // reset the contig info
// TODO: this is a dangerous method -- can we get access to the underlying file object seek?
long bytesToSkip = dist;
while ( bytesToSkip > 0 ) {
try {
final long skipped = this.in.skip(bytesToSkip);
bytesToSkip -= skipped;
// System.out.printf(" -> skipping %d, %d remaining%n", skipped, bytesToSkip);
} catch (IOException ioe) {
throw new PicardException("Error reading from file: " + this.file.getAbsolutePath(), ioe);
}
}
if ( bytesToSkip != 0 ) { // skip dist bytes
throw new PicardException(String.format("Failed to skip all of the %d bases requested, only got %d", dist, dist - bytesToSkip * 2));
}
// at this point we're ready to start looking for the next header, so call seekNextContigName()
final String next = seekForNextContig(seekContig);
if ( ! next.equals(seekContig) ) // OMG, what the hell happened, throw a runtime exception
throw new PicardException(String.format("Failed to seek from %s to %s, ended up at %s",
curContig, seekContig, next));
else {
this.currentContigName = next;
return true;
}
}
}
/**
* Reads the next contig from the fasta file, and returns it as a ReferenceSequence.
*
* @return null if there are no more sequences in the fasta stream
*/
public ReferenceSequence nextSequence() {
if ( DEBUG ) logger.debug(String.format("Calling nextSequence()%n"));
// Read the header line
currentContigName = getNextContigName();
if ( currentContigName == null ) return null; // no more sequences!
int index = -1;
if ( this.sequenceDictionary != null )
index = this.sequenceDictionary.getSequenceIndex(currentContigName);
// Read the sequence
byte[] tmp = new byte[4096];
int basesRead;
int totalBasesRead = 0;
final int knownLength = (index == -1) ? -1 : this.sequenceDictionary.getSequence(index).getSequenceLength();
final int lengthByteArray = (knownLength != -1) ? knownLength : 250000000;
byte[] bases = new byte[lengthByteArray];
while ((basesRead = readNextLine(bases, totalBasesRead)) != 0) {
totalBasesRead += basesRead;
// Make sure we'll have space for the next iteration if we need it
if (totalBasesRead == knownLength) {
//System.out.printf("Read bases: %s%n", StringUtil.bytesToString(bases, totalBasesRead - basesRead, basesRead).trim());
assert peekOneByte() == -1 || peekOneByte() == '>' : "We somehow managed to read in enough bytes for the contig, but didn't pass through the entire contig";
break;
} else {
final byte b = peekOneByte();
if (b == -1 || b == '>') {
break;
}
else if (totalBasesRead == bases.length) {
tmp = new byte[bases.length * 2];
System.arraycopy(bases, 0, tmp, 0, totalBasesRead);
bases = tmp;
tmp = null;
}
}
}
// And lastly resize the array down to the right size
if (totalBasesRead != bases.length) {
tmp = new byte[totalBasesRead];
System.arraycopy(bases, 0, tmp, 0, totalBasesRead);
bases = tmp;
tmp = null;
}
assert knownLength == -1 || knownLength == bases.length;
this.nextContigName = null; // we no longer know what the next contig name is
if ( DEBUG ) logger.debug(String.format(" => nextSequence() is returning %s, known length = %d%n", this.currentContigName, knownLength));
if ( DEBUG ) logger.debug(String.format(" => nextSequence() next is %s%n", this.getNextContigName()));
return new ReferenceSequence(currentContigName, index, bases);
}
/**
* Returns the next of the next contig, or null if there are no more contigs. Stateful function, it
* remembers the name of the next contig. Use readNextContigName for stateless operation. This function
* is primarily useful if you need to know what the next contig is in the stream for algorithmic purposes.
*
* Note that this function assumes the stream is sitting right at the end of the previous contig, or at the
* beginning of the file.
*
* @return the name of the next contig, or null if there is no next contig
*/
public String getNextContigName() {
if ( DEBUG ) logger.debug(String.format("getNextContigName() => %s%n", this.nextContigName));
if ( this.nextContigName == null ) {
// If it's not null, we've already looked up the next contig name, just return it and happily continue
// Otherwise we need to actually read in the name
this.nextContigName = readNextContigName();
}
if ( DEBUG ) logger.debug(String.format("nextContigName is now %s%n", nextContigName));
return this.nextContigName;
}
/**
* Simply reads the next contig name from the fasta input stream. It assumes that the stream is positioned
* immediately before the start of the next contig. Calling it without this condition met results in the
* RuntimeIOException being thrown. This method advances the fasta stream itself -- subsequent calls to the
* method will lead to errors. getNextContigName is the stateful version.
*
* @See getNextContigName()
*
* @return the string name of the next contig
*/
private String readNextContigName() {
// Otherwise we need to actually read in the name
byte[] tmp = new byte[4096];
final int nameLength = readNextLine(tmp, 0);
if (nameLength != 0) {
// 0 means no more sequences!
if ( tmp[0] != '>' )
throw new RuntimeIOException("The next line is supposed to be a fasta contig start but found " + StringUtil.bytesToString(tmp, 0, nameLength).trim());
return StringUtil.bytesToString(tmp, 1, nameLength).trim();
}
return null;
}
/**
* @return The name of the contig we returned in the last call to nextSequence()
*/
public String getContigName() {
return this.currentContigName;
}
/**
* Moves the IO stream to right before the next contig marker in the fasta file, for anywhere inside the
* previous contig. It is primarily useful as a supplementary routine for jumping forward in the file by
* N bases, since we don't know exactly how far way N bases in bytes will be in the file. So a guess jump
* will put us somewhere before the target contig, and we use this routine to seek forward to the actual
* contig we want.
*
* @return the name of the next contig, as a string
*/
public String seekForNextContig(final String targetContig ) {
//System.out.printf("seekForNextContig()%n");
int basesRead;
int totalBasesRead = 0;
byte[] bases = new byte[4096];
int i = 0;
while ((basesRead = readNextLine(bases, 0)) != 0) {
totalBasesRead += basesRead;
// Keep looking for the > marking the start of the line, and stop
final byte b = peekOneByte();
if (b == -1 || b == '>') {
final String foundContig = readNextContigName();
// System.out.printf("Found a contig name line %s%n", foundContig);
final int foundIndex = this.sequenceDictionary.getSequenceIndex(foundContig);
final int ourIndex = this.sequenceDictionary.getSequenceIndex(targetContig);
if ( foundIndex == ourIndex ) {
// we found our target!
this.nextContigName = foundContig; // store the right answer
if ( DEBUG ) logger.debug(String.format("seekForNextContig found %s%n", foundContig));
return foundContig;
}
else if ( foundIndex <= ourIndex )
// we are still looking for our contig, the seek estimate was inaccurate relative to the size of contings in this area
continue;
else {
// This is really bad -- we are past our target
throw new RuntimeIOException(String.format("Seek pushes us past our target contig of %s, instead we found %s, which is after the target in the sequence dictions", targetContig, foundContig));
}
}
}
return null;
}
/** Peeks one non line-terminating byte from the file and returns it. */
private byte peekOneByte() {
try {
this.in.mark(16);
byte b = '\n';
while (b == '\n' || b == '\r') {
b = (byte) this.in.read();
}
this.in.reset();
return b;
}
catch (IOException ioe) {
throw new PicardException("Error reading from file: " + this.file.getAbsolutePath(), ioe);
}
}
/**
* Reads the next line from the file and puts the bytes into the buffer starting at
* the provided index. Stops when the buffer is full, a line terminator is hit or
* the end of the file is hit.
*/
private int readNextLine(final byte[] buffer, final int index) {
try {
int next;
int read = 0;
while ((next = this.in.read()) != -1 && index + read < buffer.length) {
final byte b = (byte) next;
if (b == '\r' || b == '\n') {
if (read != 0) return read;
}
else {
buffer[index + read++] = b;
}
}
return read;
}
catch (IOException ioe) {
throw new PicardException("Error reading line from file: " + this.file.getAbsolutePath(), ioe);
}
}
/** Returns the full path to the reference file. */
public String toString() {
return this.file.getAbsolutePath();
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import static junit.framework.Assert.fail;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
@ -9,12 +10,13 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
@ -47,7 +49,7 @@ import java.util.List;
public class SAMBAMDataSourceTest extends BaseTest {
private List<File> fl;
private FastaSequenceFile2 seq;
private ReferenceSequenceFile seq;
/**
* This function does the setup of our parser, before each method call.
@ -55,11 +57,11 @@ public class SAMBAMDataSourceTest extends BaseTest {
* Called before every test case method.
*/
@Before
public void doForEachTest() {
public void doForEachTest() throws FileNotFoundException {
fl = new ArrayList<File>();
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary());
}

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.iterators;
import static junit.framework.Assert.fail;
import net.sf.samtools.SAMRecord;
import net.sf.picard.reference.ReferenceSequenceFile;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
@ -10,13 +11,14 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SimpleDataSourceLoadException;
import org.broadinstitute.sting.gatk.Reads;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Before;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.List;
@ -60,7 +62,7 @@ public class BoundedReadIteratorTest extends BaseTest {
/** the file list and the fasta sequence */
private List<File> fl;
private FastaSequenceFile2 seq;
private ReferenceSequenceFile seq;
/**
* This function does the setup of our parser, before each method call.
@ -68,11 +70,11 @@ public class BoundedReadIteratorTest extends BaseTest {
* Called before every test case method.
*/
@Before
public void doForEachTest() {
public void doForEachTest() throws FileNotFoundException {
fl = new ArrayList<File>();
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary());
}

View File

@ -7,7 +7,7 @@ package org.broadinstitute.sting.gatk.refdata;
import org.junit.*;
import static org.junit.Assert.assertTrue;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.io.File;
@ -17,20 +17,22 @@ import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.ArrayList;
import net.sf.picard.reference.ReferenceSequenceFile;
/**
* Basic unit test for TabularROD
*
*/
public class TabularRODTest extends BaseTest {
private static FastaSequenceFile2 seq;
private static ReferenceSequenceFile seq;
private ReferenceOrderedData ROD;
private RODIterator iter;
@BeforeClass
public static void init() {
public static void init() throws FileNotFoundException {
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
GenomeLocParser.setupRefContigOrdering(seq);
}

View File

@ -12,7 +12,6 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.walkers.CountReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import static org.junit.Assert.fail;
import org.junit.Before;
@ -26,6 +25,8 @@ import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.List;
import net.sf.picard.reference.ReferenceSequenceFile;
/**
*
* User: aaron
@ -54,7 +55,7 @@ import java.util.List;
*/
public class TraverseReadsTest extends BaseTest {
private FastaSequenceFile2 seq;
private ReferenceSequenceFile seq;
private File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"); // TCGA-06-0188.aligned.duplicates_marked.bam");
private File refFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta");
private List<File> bamList;

View File

@ -8,20 +8,23 @@ import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File;
import java.io.FileNotFoundException;
import net.sf.picard.reference.ReferenceSequenceFile;
/**
* Basic unit test for GenomeLoc
*/
public class GenomeLocTest extends BaseTest {
private static FastaSequenceFile2 seq;
private static ReferenceSequenceFile seq;
@BeforeClass
public static void init() {
public static void init() throws FileNotFoundException {
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
GenomeLocParser.setupRefContigOrdering(seq);
}

View File

@ -5,16 +5,19 @@ package org.broadinstitute.sting.utils;
// the imports for unit testing.
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import static org.junit.Assert.assertTrue;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.List;
import net.sf.picard.reference.ReferenceSequenceFile;
/**
* Basic unit test for RefHanger
*
@ -50,7 +53,7 @@ import java.util.List;
*/
public class RefHangerTest extends BaseTest {
private static FastaSequenceFile2 seq;
private static ReferenceSequenceFile seq;
private GenomeLoc startLoc;
private RefHanger<Integer> emptyHanger;
@ -67,9 +70,9 @@ public class RefHangerTest extends BaseTest {
private static GenomeLoc p1, p2, p3, p4, p5;
@BeforeClass
public static void init() {
public static void init() throws FileNotFoundException {
// sequence
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
GenomeLocParser.setupRefContigOrdering(seq);
System.out.printf("Filled hanger is %n%s%n", makeFilledHanger());

View File

@ -1,197 +0,0 @@
package org.broadinstitute.sting.utils.fasta;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.BaseTest;
import org.junit.*;
import java.io.File;
/**
* Created by IntelliJ IDEA.
* User: mhanna
* Date: Apr 11, 2009
* Time: 2:32:52 PM
*/
public class FastaSequenceFile2Test extends BaseTest {
private static String sequenceFileName;
private FastaSequenceFile2 sequenceFile = null;
private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
private final String firstBasesOfChr1 = "taaccctaaccctaacccta";
private final String firstBasesOfChr8 = "GCAATTATGACACAAAAAAT";
@BeforeClass
public static void initialize() {
sequenceFileName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
}
@Before
public void doForEachTest() {
sequenceFile = new FastaSequenceFile2( new File(sequenceFileName) );
}
/**
* Tears down the test fixture after each call.
* <p/>
* Called after every test case method.
*/
@After
public void undoForEachTest() {
sequenceFile = null;
}
@Test
public void testOpenFile() {
logger.warn("Executing testOpenFile");
long startTime = System.currentTimeMillis();
Assert.assertNotNull( sequenceFile );
long endTime = System.currentTimeMillis();
System.err.printf("testOpenFile runtime: %dms%n", (endTime - startTime)) ;
}
@Test
public void testFirstSequence() {
logger.warn("Executing testFirstSequence");
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.nextSequence();
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
Assert.assertEquals( "First n bases of chrM are incorrect",
StringUtil.bytesToString( sequence.getBases(), 0, firstBasesOfChrM.length() ),
firstBasesOfChrM );
long endTime = System.currentTimeMillis();
System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ;
}
@Test
public void testNextSequence() {
logger.warn("Executing testNextSequence");
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = null;
// Advance to chrM.
sequence = sequenceFile.nextSequence();
sequence = sequenceFile.nextSequence();
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr1");
// Workaround: bytesToString for chr1 of the fasta file we've picked doesn't appear to work.
// TODO: Report this as sam-jdk bug.
byte[] firstOfChr1 = StringUtil.stringToBytes(firstBasesOfChr1);
byte[] firstOfSequence = new byte[firstBasesOfChr1.length()];
System.arraycopy(sequence.getBases(), 0, firstOfSequence, 0, firstOfSequence.length );
Assert.assertArrayEquals("First bases of chr1 are not correct", firstOfChr1, firstOfSequence );
long endTime = System.currentTimeMillis();
System.err.printf("testNextSequence runtime: %dms%n", (endTime - startTime)) ;
}
@Test
public void testSeekToSequence() {
logger.warn("Executing testSeekToSequence");
long startTime = System.currentTimeMillis();
boolean success = sequenceFile.seekToContig("chr8");
Assert.assertTrue("Seek to seq chr8 failed", success );
ReferenceSequence sequence = sequenceFile.nextSequence();
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr8");
Assert.assertEquals( "First n bases of chrc are incorrect",
StringUtil.bytesToString( sequence.getBases(), 0, firstBasesOfChr8.length() ),
firstBasesOfChr8 );
long endTime = System.currentTimeMillis();
System.err.printf("testSeekToSequence runtime: %dms%n", (endTime - startTime)) ;
}
// TODO: Is NullPointerException *really* the right exception when a sequence is missing?
@Test(expected=NullPointerException.class)
public void testSeekToMissingSequence() {
logger.warn("Executing testSeekToMissingSequence");
long startTime = 0L, endTime = 0L;
try {
startTime = System.currentTimeMillis();
boolean success = sequenceFile.seekToContig("absent");
}
finally {
endTime = System.currentTimeMillis();
System.err.printf("testSeekToMissingSequence runtime: %dms%n", (endTime - startTime)) ;
}
}
@Test
public void testSeekBackward() {
logger.warn("Executing testSeekBackward");
long startTime = System.currentTimeMillis();
boolean success = sequenceFile.seekToContig("chr9");
Assert.assertTrue("Unable to seek to contig 'chr9'", success);
success = sequenceFile.seekToContig("chr8",true);
Assert.assertTrue("Unable to seek backward to contig 'chr8'", success);
ReferenceSequence sequence = sequenceFile.nextSequence();
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr8");
Assert.assertEquals( "First n bases of chrc are incorrect",
StringUtil.bytesToString( sequence.getBases(), 0, firstBasesOfChr8.length() ),
firstBasesOfChr8 );
long endTime = System.currentTimeMillis();
System.err.printf("testSeekBackward runtime: %dms%n", (endTime - startTime)) ;
}
@Test
public void testInvalidSeekBackward() {
logger.warn("Executing testInvalidSeekBackward");
long startTime = System.currentTimeMillis();
boolean success = sequenceFile.seekToContig("chr9");
Assert.assertTrue("Unable to seek to contig 'chr9'", success);
success = sequenceFile.seekToContig("chr8");
Assert.assertFalse("Unable to seek backward to contig 'chr8'", success);
long endTime = System.currentTimeMillis();
System.err.printf("testInvalidSeekBackward runtime: %dms%n", (endTime - startTime)) ;
}
@Test
public void testSimultaneousAccess() {
logger.warn("Executing testSimultaneousAccess");
long startTime = System.currentTimeMillis();
// FastaSequenceFile2 other = (FastaSequenceFile2)sequenceFile.clone();
sequenceFile.seekToContig("chr1");
ReferenceSequence chr1 = sequenceFile.nextSequence();
// other.seekToContig("chr8");
// ReferenceSequence chr8 = other.nextSequence();
// System.err.printf( "sequenceFile contig: %s%n", sequenceFile.getContigName() );
// System.err.printf( "other contig: %s%n", other.getContigName() );
long endTime = System.currentTimeMillis();
System.err.printf("testSimultaneousAccess runtime: %dms%n", (endTime - startTime)) ;
}
}

View File

@ -10,6 +10,8 @@ import java.io.File;
import java.io.FileNotFoundException;
import net.sf.picard.reference.ReferenceSequence;
import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.picard.PicardException;
import net.sf.samtools.util.StringUtil;
@ -122,7 +124,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testFirstCompleteContigRead() {
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
@ -164,9 +166,10 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testMiddleCompleteContigRead() {
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
originalSequenceFile.seekToContig("chrY");
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
while( !expectedSequence.getName().equals("chrY") )
expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSequence("chrY");
@ -183,9 +186,10 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testLastCompleteContigRead() {
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
originalSequenceFile.seekToContig("chrX_random");
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
while( !expectedSequence.getName().equals("chrX_random") )
expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
ReferenceSequence sequence = sequenceFile.getSequence("chrX_random");
@ -233,7 +237,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testFirstElementOfIterator() {
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
long startTime = System.currentTimeMillis();
@ -251,7 +255,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
@Test
public void testNextElementOfIterator() {
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
// Skip past the first one and load the second one.
originalSequenceFile.nextSequence();
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();