Cleanup...deprecate FastaSequenceFile2 in favor of IndexedFastaSequenceFile or ReferenceSequenceFile from Picard, depending on the application.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1196 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
0a67386525
commit
433ad1f060
|
|
@ -1,7 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.util.Iterator;
|
||||
|
|
|
|||
|
|
@ -1,213 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.iterators;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: Feb 24, 2009
|
||||
* Time: 10:45:01 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class ReferenceIterator implements Iterator<ReferenceIterator> {
|
||||
|
||||
// enable debugging output?
|
||||
private final boolean DEBUG = false;
|
||||
|
||||
// The reference sequence file generator
|
||||
private FastaSequenceFile2 refFile;
|
||||
|
||||
private ReferenceSequence currentContig = null;
|
||||
//private ReferenceSequence nextContig = null;
|
||||
private long offset = -1;
|
||||
|
||||
public ReferenceIterator(FastaSequenceFile2 refFile) {
|
||||
this.refFile = refFile;
|
||||
}
|
||||
|
||||
/** our log, which we want to capture anything from this class */
|
||||
private static Logger logger = Logger.getLogger(ReferenceIterator.class);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Accessing data
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public byte getBaseAsByte() {
|
||||
return currentContig.getBases()[(int) offset];
|
||||
}
|
||||
|
||||
public String getBaseAsString() {
|
||||
assert offset > -1 : currentContig.getName() + " index is " + offset;
|
||||
//assert offset < currentContig.getBases().();
|
||||
|
||||
return StringUtil.bytesToString(currentContig.getBases(), (int) offset, 1);
|
||||
}
|
||||
|
||||
public char getBaseAsChar() {
|
||||
return getBaseAsString().charAt(0);
|
||||
}
|
||||
|
||||
public ReferenceSequence getCurrentContig() {
|
||||
return currentContig;
|
||||
}
|
||||
|
||||
public long getPosition() {
|
||||
return offset + 1;
|
||||
}
|
||||
|
||||
public GenomeLoc getLocation() {
|
||||
return GenomeLocParser.createGenomeLoc(getCurrentContig().getName(), getPosition());
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Iterator routines
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public synchronized boolean hasNext() {
|
||||
if (currentContig == null || offset + 1 < currentContig.length()) {
|
||||
return true;
|
||||
} else {
|
||||
return readNextContig();
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized ReferenceIterator next() {
|
||||
if (currentContig != null) {
|
||||
if (DEBUG)
|
||||
logger.debug(String.format(" -> %s:%d %d%n", currentContig.getName(), offset, currentContig.length()));
|
||||
}
|
||||
offset++; // move on to the next position
|
||||
|
||||
if (currentContig == null || offset >= currentContig.length()) {
|
||||
// We need to update the contig
|
||||
if (readNextContig()) {
|
||||
// We sucessfully loaded the next contig, recursively call next
|
||||
return next();
|
||||
} else {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
} else {
|
||||
// We're good to go -- we're in the current contig
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Jumping forward
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
public ReferenceIterator seekForward(final GenomeLoc loc) {
|
||||
assert loc != null : "seekForward location is null";
|
||||
|
||||
return seekForwardOffset(loc.getContig(), loc.getStart() - 1);
|
||||
}
|
||||
|
||||
public ReferenceIterator seekForward(final String contigName, final long pos) {
|
||||
return seekForwardOffset(contigName, pos - 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper routine that doesn't move the contigs around, it just checks that everything is kosher in the seek
|
||||
* within this chromosome
|
||||
*
|
||||
* @param seekContigName name for printing pursues, asserted to be the current contig name
|
||||
* @param seekOffset where we want to be in this contig
|
||||
* @return this setup to be at seekoffset within seekContigName
|
||||
*/
|
||||
private ReferenceIterator seekForwardOffsetOnSameContig(final String seekContigName, final long seekOffset) {
|
||||
assert seekContigName.equals(currentContig.getName()) : String.format("only works on this contig, but the current %s and sought %s contigs are different!", currentContig.getName(), seekContigName);
|
||||
|
||||
// we're somewhere on this contig
|
||||
if (seekOffset < offset ) {
|
||||
// bad boy -- can't go backward safely
|
||||
throw new IllegalArgumentException(String.format("Invalid seek %s => %s, which is usually due to out of order reads%n",
|
||||
GenomeLocParser.createGenomeLoc(currentContig.getName(), offset), GenomeLocParser.createGenomeLoc(seekContigName, seekOffset)));
|
||||
} else if (seekOffset >= currentContig.length()) {
|
||||
// bad boy -- can't go beyond the contig length
|
||||
throw new IllegalArgumentException(String.format("Invalid seek to %s, which is beyond the end of the contig%n",
|
||||
GenomeLocParser.createGenomeLoc(currentContig.getName(), seekOffset + 1)));
|
||||
} else {
|
||||
offset = seekOffset - 1;
|
||||
return next();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private ReferenceIterator seekForwardOffset(final String seekContigName, final long seekOffset) {
|
||||
assert seekContigName != null : "seekContigName is null";
|
||||
assert seekOffset >= 0 : "seekOffset < 0: " + seekOffset;
|
||||
|
||||
// jumps us forward in the sequence to the contig / pos
|
||||
if (currentContig == null)
|
||||
next();
|
||||
|
||||
if (DEBUG)
|
||||
logger.debug(String.format(" -> Seeking to %s %d from %s %d%n", seekContigName, seekOffset, currentContig.getName(), offset));
|
||||
|
||||
|
||||
int cmpContigs = GenomeLocParser.compareContigs(seekContigName,currentContig.getName());
|
||||
|
||||
if ( cmpContigs < 0 && GenomeLocParser.hasKnownContigOrdering() ) { // if we know the order of contigs and we are already past the contig we seek, it's too late!
|
||||
// The contig we are looking for is before the currentContig -- it's an error
|
||||
throw new IllegalArgumentException(String.format("Invalid seek %s => %s, contigs/sequences are out of order%n",
|
||||
GenomeLocParser.createGenomeLoc(currentContig.getName(), offset), GenomeLocParser.createGenomeLoc(seekContigName, seekOffset)));
|
||||
}
|
||||
|
||||
if ( cmpContigs > 0 || (! GenomeLocParser.hasKnownContigOrdering() ) && cmpContigs != 0 ) { // if contig we seek is still ahead, or if we have no idea what the order is and current contig is not what we seek
|
||||
// then try to seek forward in the reference file until we get the contig we need
|
||||
if (DEBUG)
|
||||
logger.debug(String.format(" -> Seeking in the fasta file to %s from %s%n", seekContigName, currentContig.getName()));
|
||||
|
||||
if (!refFile.seekToContig(seekContigName)) { // ok, do the seek
|
||||
// a false result indicates a failure, throw a somewhat cryptic call
|
||||
throw new RuntimeIOException(String.format("Unexpected seek failure from %s to %s%n",
|
||||
GenomeLocParser.createGenomeLoc(currentContig.getName(), offset), GenomeLocParser.createGenomeLoc(seekContigName, seekOffset)));
|
||||
}
|
||||
|
||||
readNextContig(); // since we haven't failed, we just read in the next contig (which is seekContigName)
|
||||
}
|
||||
|
||||
// at this point, the current contig is seekContigName, so just do a bit more error checking and be done
|
||||
return seekForwardOffsetOnSameContig(seekContigName, seekOffset);
|
||||
}
|
||||
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Interal state manipulation
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
protected boolean readNextContig() {
|
||||
// returns true if we had another contig to load
|
||||
currentContig = refFile.nextSequence();
|
||||
offset = -1;
|
||||
return currentContig != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple forwarding method to the refFile itself
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public String nextContigName() {
|
||||
return this.refFile.getNextContigName();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -17,7 +17,6 @@ import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
|||
import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -54,9 +53,6 @@ public abstract class TraversalEngine {
|
|||
|
||||
// The reference data -- filename, refSeqFile, and iterator
|
||||
protected File refFileName = null; // the name of the reference file
|
||||
//private ReferenceSequenceFile refFile = null;
|
||||
protected FastaSequenceFile2 refFile = null; // todo: merge FastaSequenceFile2 into picard!
|
||||
protected ReferenceIterator refIter = null;
|
||||
|
||||
// Progress tracker for the sam file
|
||||
protected FileProgressTracker samReadingTracker = null;
|
||||
|
|
@ -317,7 +313,6 @@ public abstract class TraversalEngine {
|
|||
*/
|
||||
public boolean initialize() {
|
||||
lastProgressPrintTime = startTime = System.currentTimeMillis();
|
||||
initializeReference();
|
||||
// Initial the reference ordered data iterators
|
||||
initializeRODs();
|
||||
|
||||
|
|
@ -350,21 +345,6 @@ public abstract class TraversalEngine {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare the reference for stream processing
|
||||
*/
|
||||
protected void initializeReference() {
|
||||
if (refFileName != null) {
|
||||
//this.refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFileName);
|
||||
this.refFile = new FastaSequenceFile2(refFileName); // todo: replace when FastaSequenceFile2 is in picard
|
||||
this.refIter = new ReferenceIterator(this.refFile);
|
||||
if (!GenomeLocParser.setupRefContigOrdering(this.refFile)) {
|
||||
// We couldn't process the reference contig ordering, fail since we need it
|
||||
Utils.scareUser(String.format("We couldn't load the contig dictionary associated with %s. At the current time we require this dictionary file to efficiently access the FASTA file. Please use /seq/software/picard/current/bin/CreateSequenceDictionary.jar to create a sequence dictionary for your file", refFileName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Prepare the list of reference ordered data iterators for each of the rods
|
||||
*
|
||||
|
|
@ -378,17 +358,6 @@ public abstract class TraversalEngine {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* An inappropriately placed testing of reading the reference
|
||||
*/
|
||||
protected void testReference() {
|
||||
while (true) {
|
||||
ReferenceSequence ref = refFile.nextSequence();
|
||||
logger.debug(String.format("%s %d %d", ref.getName(), ref.length(), System.currentTimeMillis()));
|
||||
printProgress(true, "loci", GenomeLocParser.createGenomeLoc("foo", 1));
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// shutdown
|
||||
|
|
|
|||
|
|
@ -8,9 +8,7 @@ import org.broadinstitute.sting.gatk.refdata.rodGFF;
|
|||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
|
||||
import java.util.List;
|
||||
|
|
|
|||
|
|
@ -8,7 +8,6 @@ import org.broadinstitute.sting.utils.StingException;
|
|||
import org.broadinstitute.sting.utils.containers.BoundedScoringSet;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
|
|
|
|||
|
|
@ -3,9 +3,11 @@ package org.broadinstitute.sting.secondarybase;
|
|||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
|
|
@ -159,7 +161,7 @@ public class BasecallingTrainer {
|
|||
* @return a vector of perfect reads, grouped by tile
|
||||
*/
|
||||
private Vector<HashMap<String, SAMRecord>> getPerfectAlignmentsByTile(File samIn, File reference) {
|
||||
FastaSequenceFile2 ref = new FastaSequenceFile2(reference);
|
||||
ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference);
|
||||
String currentContig = "none";
|
||||
byte[] refbases = null;
|
||||
|
||||
|
|
@ -175,9 +177,11 @@ public class BasecallingTrainer {
|
|||
int offset = sr.getAlignmentStart();
|
||||
|
||||
if (!currentContig.matches(sr.getReferenceName())) {
|
||||
ref.seekToContig(sr.getReferenceName());
|
||||
ReferenceSequence refSeq = ref.nextSequence();
|
||||
while( !refSeq.getName().equals(sr.getReferenceName()) )
|
||||
refSeq = ref.nextSequence();
|
||||
currentContig = sr.getReferenceName();
|
||||
refbases = ref.nextSequence().getBases();
|
||||
refbases = refSeq.getBases();
|
||||
}
|
||||
|
||||
int mismatches = 0;
|
||||
|
|
|
|||
|
|
@ -1,472 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.io.IoUtil;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import net.sf.samtools.SAMTextHeaderCodec;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import net.sf.samtools.util.AsciiLineReader;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
/**
|
||||
* Implementation of ReferenceSequenceFile for reading from FASTA files.
|
||||
*
|
||||
* Now supports additional operations to seek and query the next contig from the file, for efficient
|
||||
* implementation of jumping forward in the file.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
* @author Extended in parts by Mark DePristo
|
||||
*/
|
||||
public class FastaSequenceFile2 implements ReferenceSequenceFile {
|
||||
private final File file;
|
||||
private BufferedInputStream in;
|
||||
private SAMSequenceDictionary sequenceDictionary = null;
|
||||
private String currentContigName = null;
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(FastaSequenceFile2.class);
|
||||
|
||||
/**
|
||||
* Set to true to see lots of debugging output during operation
|
||||
*/
|
||||
private final boolean DEBUG = false;
|
||||
|
||||
/**
|
||||
* The name, if known, of the next contig in the file. Can be null, indicating either that there is
|
||||
* no known next contig name, or that the file has been completely read
|
||||
*/
|
||||
private String nextContigName = null;
|
||||
|
||||
/** Constructs a FastaSequenceFile that reads from the specified file. */
|
||||
public FastaSequenceFile2(final File file) {
|
||||
this.file = file;
|
||||
initializeInputStream();
|
||||
|
||||
// Try and locate the dictionary
|
||||
String dictionaryName = file.getAbsolutePath();
|
||||
dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta"));
|
||||
dictionaryName += ".dict";
|
||||
final File dictionary = new File(dictionaryName);
|
||||
if (dictionary.exists()) {
|
||||
IoUtil.assertFileIsReadable(dictionary);
|
||||
|
||||
try {
|
||||
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
|
||||
final SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary);
|
||||
if (header.getSequenceDictionary() != null && header.getSequenceDictionary().size() > 0) {
|
||||
this.sequenceDictionary = header.getSequenceDictionary();
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens the file for input. Closes the previous input stream if there is one, and sets up all of the key
|
||||
* variables such that this object's state is clean and ready for processing sequences.
|
||||
*/
|
||||
private void initializeInputStream() {
|
||||
// we're at the start and haven't processed anything yet
|
||||
nextContigName = currentContigName = null;
|
||||
|
||||
if ( this.in != null ) { // this isn't our first time here
|
||||
try {
|
||||
this.in.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new RuntimeIOException("initializing InputStream failure", e);
|
||||
}
|
||||
}
|
||||
|
||||
// Now reopen the input stream
|
||||
this.in = new BufferedInputStream(IoUtil.openFileForReading(file));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the list of sequence records associated with the reference sequence if found
|
||||
* otherwise null.
|
||||
*/
|
||||
public SAMSequenceDictionary getSequenceDictionary() {
|
||||
return this.sequenceDictionary;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Support functions for seeking around in the file
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the distance, in bp, between contig1 and contig2 according to this fasta file's dictionary. That is,
|
||||
* the number of bp we'd have to traverse to move from the start of contig1 to reach the start of contig2.
|
||||
*
|
||||
* If contig1 occurs before contig2, a negative number is returned. 0 indicates that the contigs are the same.
|
||||
*
|
||||
* Returns Integer.MAX_VALUE if the sequence dictionary cannot be found
|
||||
*
|
||||
* @param contig1
|
||||
* @param contig2
|
||||
* @return distance in bp from the start of contig1 to the start of contig2
|
||||
*/
|
||||
public long getDistanceBetweenContigs(final String contig1, final String contig2) {
|
||||
assert contig1 != null;
|
||||
assert contig2 != null;
|
||||
|
||||
final SAMSequenceDictionary seqDict = getSequenceDictionary();
|
||||
|
||||
if ( seqDict == null ) // we couldn't load the reference dictionary
|
||||
return Integer.MAX_VALUE;
|
||||
|
||||
SAMSequenceRecord contig1Rec = seqDict.getSequence(contig1);
|
||||
SAMSequenceRecord contig2Rec = seqDict.getSequence(contig2);
|
||||
|
||||
assert contig1Rec != null : "Contig1 record is null: " + contig1;
|
||||
assert contig1Rec != null : "Contig2 record is null: " + contig2;
|
||||
|
||||
if ( DEBUG )
|
||||
logger.debug(String.format("Contig1=(%s, %d), contig2=(%s, %d)%n",
|
||||
contig1, contig1Rec.getSequenceIndex(),
|
||||
contig2, contig2Rec.getSequenceIndex()));
|
||||
|
||||
int startIndex = Math.min(contig1Rec.getSequenceIndex(), contig2Rec.getSequenceIndex());
|
||||
int lastIndex = Math.max(contig1Rec.getSequenceIndex(), contig2Rec.getSequenceIndex());
|
||||
|
||||
long bytesToTraverse = 0;
|
||||
for ( int i = startIndex; i < lastIndex; i++ ) {
|
||||
SAMSequenceRecord rec = seqDict.getSequence(i);
|
||||
bytesToTraverse += rec.getSequenceLength();
|
||||
if ( DEBUG )
|
||||
logger.debug(String.format(" -> Traversing from %15s to %15s requires reading at least %10d bytes to pass contig %15s, total bytes %10d%n",
|
||||
contig1, contig2, rec.getSequenceLength(), rec.getSequenceName(), bytesToTraverse));
|
||||
}
|
||||
|
||||
if ( contig1Rec.getSequenceIndex() > contig2Rec.getSequenceIndex() )
|
||||
bytesToTraverse *= -1; // we are going backward!
|
||||
|
||||
if ( DEBUG ) logger.debug(String.format(" -> total distance is %d%n", bytesToTraverse));
|
||||
|
||||
return bytesToTraverse;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seeks to seekContig in the fasta file, such that nextSequence() will read the seekContig from the fasta
|
||||
* file. Only allows forward seeks. Throws a RuntimeIOException if the seekContig is before the current
|
||||
* contig.
|
||||
*
|
||||
* @param seekContig the contig I want to seek to
|
||||
* @return true on success
|
||||
*
|
||||
* @see #seekToContig(String)
|
||||
*/
|
||||
public boolean seekToContig( final String seekContig ) {
|
||||
return seekToContig(seekContig, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Seeks to seekContig in the fasta file, such that nextSequence() will read the seekContig from the fasta
|
||||
* file. If enableBacktracking is false, only allows forward seeks, and throws a RuntimeIOException if the
|
||||
* seekContig is before the current contig. If enableBacktracking is true, then if seekContig is before
|
||||
* the current contig, resets the input stream and seeks to the contig.
|
||||
*
|
||||
* Requires that the fasta file have a SequenceDictionary associated with it. Otherwises throws an error
|
||||
*
|
||||
* @param seekContig The contig I want to seek to
|
||||
* @param enableBacktracking Should we allow seeks to contigs earlier in the file?
|
||||
* @return true on success
|
||||
*/
|
||||
public boolean seekToContig(final String seekContig, boolean enableBacktracking ) {
|
||||
if ( DEBUG ) logger.debug(String.format("seekToContig( %s, %b )%n", seekContig, enableBacktracking));
|
||||
|
||||
String curContig = getContigName();
|
||||
String nextContig = null;
|
||||
|
||||
if ( curContig == null ) {
|
||||
logger.info(String.format("CurrentContig is null"));
|
||||
if ( this.sequenceDictionary == null )
|
||||
throw new PicardException( String.format("Seeking within contigs requires FASTA dictionary, but none was available for %s", this.file ));
|
||||
|
||||
// We only reach this point when we're seeking before we've read in any of the fasta file,
|
||||
// so assume we are at the start of the file
|
||||
nextContig = this.sequenceDictionary.getSequence(0).getSequenceName();
|
||||
}
|
||||
else
|
||||
nextContig = getNextContigName();
|
||||
|
||||
if ( nextContig == null ) // we're are at the end of the stream
|
||||
return false;
|
||||
|
||||
// we have already read in the current contig, we are jumping from the next contig onwards
|
||||
long dist = getDistanceBetweenContigs(nextContig, seekContig);
|
||||
|
||||
if ( dist == Integer.MAX_VALUE )
|
||||
return false; // we don't know where to go
|
||||
else if ( dist == 0 )
|
||||
return true; // We already here!
|
||||
else if ( dist < 0 ) {
|
||||
if ( enableBacktracking ) {
|
||||
// System.out.printf("*** Backtracking to %s%n", seekContig);
|
||||
// restart from the beginning, and try again
|
||||
initializeInputStream();
|
||||
return seekToContig(seekContig, enableBacktracking);
|
||||
} else
|
||||
return false; // we're not going backwards just yet
|
||||
}
|
||||
else {
|
||||
if ( DEBUG ) logger.debug(String.format("Going to seek to contig %s with skip %d%n", seekContig, dist));
|
||||
// we're actually going to jump somewhere, so prepare the state
|
||||
this.nextContigName = null; // reset the contig info
|
||||
|
||||
// TODO: this is a dangerous method -- can we get access to the underlying file object seek?
|
||||
long bytesToSkip = dist;
|
||||
while ( bytesToSkip > 0 ) {
|
||||
try {
|
||||
final long skipped = this.in.skip(bytesToSkip);
|
||||
bytesToSkip -= skipped;
|
||||
// System.out.printf(" -> skipping %d, %d remaining%n", skipped, bytesToSkip);
|
||||
} catch (IOException ioe) {
|
||||
throw new PicardException("Error reading from file: " + this.file.getAbsolutePath(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
if ( bytesToSkip != 0 ) { // skip dist bytes
|
||||
throw new PicardException(String.format("Failed to skip all of the %d bases requested, only got %d", dist, dist - bytesToSkip * 2));
|
||||
}
|
||||
|
||||
// at this point we're ready to start looking for the next header, so call seekNextContigName()
|
||||
final String next = seekForNextContig(seekContig);
|
||||
|
||||
if ( ! next.equals(seekContig) ) // OMG, what the hell happened, throw a runtime exception
|
||||
throw new PicardException(String.format("Failed to seek from %s to %s, ended up at %s",
|
||||
curContig, seekContig, next));
|
||||
else {
|
||||
this.currentContigName = next;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next contig from the fasta file, and returns it as a ReferenceSequence.
|
||||
*
|
||||
* @return null if there are no more sequences in the fasta stream
|
||||
*/
|
||||
public ReferenceSequence nextSequence() {
|
||||
if ( DEBUG ) logger.debug(String.format("Calling nextSequence()%n"));
|
||||
|
||||
// Read the header line
|
||||
currentContigName = getNextContigName();
|
||||
if ( currentContigName == null ) return null; // no more sequences!
|
||||
|
||||
int index = -1;
|
||||
if ( this.sequenceDictionary != null )
|
||||
index = this.sequenceDictionary.getSequenceIndex(currentContigName);
|
||||
|
||||
// Read the sequence
|
||||
byte[] tmp = new byte[4096];
|
||||
int basesRead;
|
||||
int totalBasesRead = 0;
|
||||
final int knownLength = (index == -1) ? -1 : this.sequenceDictionary.getSequence(index).getSequenceLength();
|
||||
final int lengthByteArray = (knownLength != -1) ? knownLength : 250000000;
|
||||
byte[] bases = new byte[lengthByteArray];
|
||||
|
||||
while ((basesRead = readNextLine(bases, totalBasesRead)) != 0) {
|
||||
totalBasesRead += basesRead;
|
||||
|
||||
// Make sure we'll have space for the next iteration if we need it
|
||||
if (totalBasesRead == knownLength) {
|
||||
//System.out.printf("Read bases: %s%n", StringUtil.bytesToString(bases, totalBasesRead - basesRead, basesRead).trim());
|
||||
|
||||
assert peekOneByte() == -1 || peekOneByte() == '>' : "We somehow managed to read in enough bytes for the contig, but didn't pass through the entire contig";
|
||||
break;
|
||||
} else {
|
||||
final byte b = peekOneByte();
|
||||
if (b == -1 || b == '>') {
|
||||
break;
|
||||
}
|
||||
else if (totalBasesRead == bases.length) {
|
||||
tmp = new byte[bases.length * 2];
|
||||
System.arraycopy(bases, 0, tmp, 0, totalBasesRead);
|
||||
bases = tmp;
|
||||
tmp = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// And lastly resize the array down to the right size
|
||||
if (totalBasesRead != bases.length) {
|
||||
tmp = new byte[totalBasesRead];
|
||||
System.arraycopy(bases, 0, tmp, 0, totalBasesRead);
|
||||
bases = tmp;
|
||||
tmp = null;
|
||||
}
|
||||
|
||||
assert knownLength == -1 || knownLength == bases.length;
|
||||
|
||||
this.nextContigName = null; // we no longer know what the next contig name is
|
||||
|
||||
if ( DEBUG ) logger.debug(String.format(" => nextSequence() is returning %s, known length = %d%n", this.currentContigName, knownLength));
|
||||
if ( DEBUG ) logger.debug(String.format(" => nextSequence() next is %s%n", this.getNextContigName()));
|
||||
|
||||
return new ReferenceSequence(currentContigName, index, bases);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next of the next contig, or null if there are no more contigs. Stateful function, it
|
||||
* remembers the name of the next contig. Use readNextContigName for stateless operation. This function
|
||||
* is primarily useful if you need to know what the next contig is in the stream for algorithmic purposes.
|
||||
*
|
||||
* Note that this function assumes the stream is sitting right at the end of the previous contig, or at the
|
||||
* beginning of the file.
|
||||
*
|
||||
* @return the name of the next contig, or null if there is no next contig
|
||||
*/
|
||||
public String getNextContigName() {
|
||||
if ( DEBUG ) logger.debug(String.format("getNextContigName() => %s%n", this.nextContigName));
|
||||
|
||||
if ( this.nextContigName == null ) {
|
||||
// If it's not null, we've already looked up the next contig name, just return it and happily continue
|
||||
// Otherwise we need to actually read in the name
|
||||
this.nextContigName = readNextContigName();
|
||||
}
|
||||
|
||||
if ( DEBUG ) logger.debug(String.format("nextContigName is now %s%n", nextContigName));
|
||||
return this.nextContigName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simply reads the next contig name from the fasta input stream. It assumes that the stream is positioned
|
||||
* immediately before the start of the next contig. Calling it without this condition met results in the
|
||||
* RuntimeIOException being thrown. This method advances the fasta stream itself -- subsequent calls to the
|
||||
* method will lead to errors. getNextContigName is the stateful version.
|
||||
*
|
||||
* @See getNextContigName()
|
||||
*
|
||||
* @return the string name of the next contig
|
||||
*/
|
||||
private String readNextContigName() {
|
||||
// Otherwise we need to actually read in the name
|
||||
byte[] tmp = new byte[4096];
|
||||
final int nameLength = readNextLine(tmp, 0);
|
||||
if (nameLength != 0) {
|
||||
// 0 means no more sequences!
|
||||
if ( tmp[0] != '>' )
|
||||
throw new RuntimeIOException("The next line is supposed to be a fasta contig start but found " + StringUtil.bytesToString(tmp, 0, nameLength).trim());
|
||||
|
||||
return StringUtil.bytesToString(tmp, 1, nameLength).trim();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The name of the contig we returned in the last call to nextSequence()
|
||||
*/
|
||||
public String getContigName() {
|
||||
return this.currentContigName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the IO stream to right before the next contig marker in the fasta file, for anywhere inside the
|
||||
* previous contig. It is primarily useful as a supplementary routine for jumping forward in the file by
|
||||
* N bases, since we don't know exactly how far way N bases in bytes will be in the file. So a guess jump
|
||||
* will put us somewhere before the target contig, and we use this routine to seek forward to the actual
|
||||
* contig we want.
|
||||
*
|
||||
* @return the name of the next contig, as a string
|
||||
*/
|
||||
public String seekForNextContig(final String targetContig ) {
|
||||
//System.out.printf("seekForNextContig()%n");
|
||||
|
||||
int basesRead;
|
||||
int totalBasesRead = 0;
|
||||
byte[] bases = new byte[4096];
|
||||
int i = 0;
|
||||
while ((basesRead = readNextLine(bases, 0)) != 0) {
|
||||
totalBasesRead += basesRead;
|
||||
|
||||
// Keep looking for the > marking the start of the line, and stop
|
||||
final byte b = peekOneByte();
|
||||
if (b == -1 || b == '>') {
|
||||
final String foundContig = readNextContigName();
|
||||
// System.out.printf("Found a contig name line %s%n", foundContig);
|
||||
final int foundIndex = this.sequenceDictionary.getSequenceIndex(foundContig);
|
||||
final int ourIndex = this.sequenceDictionary.getSequenceIndex(targetContig);
|
||||
|
||||
if ( foundIndex == ourIndex ) {
|
||||
// we found our target!
|
||||
this.nextContigName = foundContig; // store the right answer
|
||||
if ( DEBUG ) logger.debug(String.format("seekForNextContig found %s%n", foundContig));
|
||||
return foundContig;
|
||||
}
|
||||
else if ( foundIndex <= ourIndex )
|
||||
// we are still looking for our contig, the seek estimate was inaccurate relative to the size of contings in this area
|
||||
continue;
|
||||
else {
|
||||
// This is really bad -- we are past our target
|
||||
throw new RuntimeIOException(String.format("Seek pushes us past our target contig of %s, instead we found %s, which is after the target in the sequence dictions", targetContig, foundContig));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Peeks one non line-terminating byte from the file and returns it. */
|
||||
private byte peekOneByte() {
|
||||
try {
|
||||
this.in.mark(16);
|
||||
byte b = '\n';
|
||||
while (b == '\n' || b == '\r') {
|
||||
b = (byte) this.in.read();
|
||||
}
|
||||
|
||||
this.in.reset();
|
||||
return b;
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Error reading from file: " + this.file.getAbsolutePath(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next line from the file and puts the bytes into the buffer starting at
|
||||
* the provided index. Stops when the buffer is full, a line terminator is hit or
|
||||
* the end of the file is hit.
|
||||
*/
|
||||
private int readNextLine(final byte[] buffer, final int index) {
|
||||
try {
|
||||
int next;
|
||||
int read = 0;
|
||||
while ((next = this.in.read()) != -1 && index + read < buffer.length) {
|
||||
final byte b = (byte) next;
|
||||
if (b == '\r' || b == '\n') {
|
||||
if (read != 0) return read;
|
||||
}
|
||||
else {
|
||||
buffer[index + read++] = b;
|
||||
}
|
||||
}
|
||||
return read;
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Error reading line from file: " + this.file.getAbsolutePath(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns the full path to the reference file. */
|
||||
public String toString() {
|
||||
return this.file.getAbsolutePath();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
|
||||
|
||||
import static junit.framework.Assert.fail;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
|
|
@ -9,12 +10,13 @@ import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategyFactory;
|
|||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -47,7 +49,7 @@ import java.util.List;
|
|||
public class SAMBAMDataSourceTest extends BaseTest {
|
||||
|
||||
private List<File> fl;
|
||||
private FastaSequenceFile2 seq;
|
||||
private ReferenceSequenceFile seq;
|
||||
|
||||
/**
|
||||
* This function does the setup of our parser, before each method call.
|
||||
|
|
@ -55,11 +57,11 @@ public class SAMBAMDataSourceTest extends BaseTest {
|
|||
* Called before every test case method.
|
||||
*/
|
||||
@Before
|
||||
public void doForEachTest() {
|
||||
public void doForEachTest() throws FileNotFoundException {
|
||||
fl = new ArrayList<File>();
|
||||
|
||||
// sequence
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.iterators;
|
|||
|
||||
import static junit.framework.Assert.fail;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.Shard;
|
||||
import org.broadinstitute.sting.gatk.datasources.shards.ShardStrategy;
|
||||
|
|
@ -10,13 +11,14 @@ import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMDataSource
|
|||
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SimpleDataSourceLoadException;
|
||||
import org.broadinstitute.sting.gatk.Reads;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
|
@ -60,7 +62,7 @@ public class BoundedReadIteratorTest extends BaseTest {
|
|||
|
||||
/** the file list and the fasta sequence */
|
||||
private List<File> fl;
|
||||
private FastaSequenceFile2 seq;
|
||||
private ReferenceSequenceFile seq;
|
||||
|
||||
/**
|
||||
* This function does the setup of our parser, before each method call.
|
||||
|
|
@ -68,11 +70,11 @@ public class BoundedReadIteratorTest extends BaseTest {
|
|||
* Called before every test case method.
|
||||
*/
|
||||
@Before
|
||||
public void doForEachTest() {
|
||||
public void doForEachTest() throws FileNotFoundException {
|
||||
fl = new ArrayList<File>();
|
||||
|
||||
// sequence
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
GenomeLocParser.setupRefContigOrdering(seq.getSequenceDictionary());
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ package org.broadinstitute.sting.gatk.refdata;
|
|||
import org.junit.*;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -17,20 +17,22 @@ import java.io.FileNotFoundException;
|
|||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
|
||||
/**
|
||||
* Basic unit test for TabularROD
|
||||
*
|
||||
*/
|
||||
public class TabularRODTest extends BaseTest {
|
||||
private static FastaSequenceFile2 seq;
|
||||
private static ReferenceSequenceFile seq;
|
||||
private ReferenceOrderedData ROD;
|
||||
private RODIterator iter;
|
||||
|
||||
|
||||
@BeforeClass
|
||||
public static void init() {
|
||||
public static void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|||
import org.broadinstitute.sting.gatk.walkers.CountReadsWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import static org.junit.Assert.fail;
|
||||
import org.junit.Before;
|
||||
|
|
@ -26,6 +25,8 @@ import java.lang.reflect.Field;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
|
|
@ -54,7 +55,7 @@ import java.util.List;
|
|||
*/
|
||||
public class TraverseReadsTest extends BaseTest {
|
||||
|
||||
private FastaSequenceFile2 seq;
|
||||
private ReferenceSequenceFile seq;
|
||||
private File bam = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/index_test.bam"); // TCGA-06-0188.aligned.duplicates_marked.bam");
|
||||
private File refFile = new File("/humgen/gsa-scr1/GATK_Data/Validation_Data/Homo_sapiens_assembly17.fasta");
|
||||
private List<File> bamList;
|
||||
|
|
|
|||
|
|
@ -8,20 +8,23 @@ import org.junit.Assert;
|
|||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
|
||||
/**
|
||||
* Basic unit test for GenomeLoc
|
||||
*/
|
||||
public class GenomeLocTest extends BaseTest {
|
||||
private static FastaSequenceFile2 seq;
|
||||
private static ReferenceSequenceFile seq;
|
||||
|
||||
@BeforeClass
|
||||
public static void init() {
|
||||
public static void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,16 +5,19 @@ package org.broadinstitute.sting.utils;
|
|||
// the imports for unit testing.
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
|
||||
/**
|
||||
* Basic unit test for RefHanger
|
||||
*
|
||||
|
|
@ -50,7 +53,7 @@ import java.util.List;
|
|||
|
||||
*/
|
||||
public class RefHangerTest extends BaseTest {
|
||||
private static FastaSequenceFile2 seq;
|
||||
private static ReferenceSequenceFile seq;
|
||||
private GenomeLoc startLoc;
|
||||
private RefHanger<Integer> emptyHanger;
|
||||
|
||||
|
|
@ -67,9 +70,9 @@ public class RefHangerTest extends BaseTest {
|
|||
private static GenomeLoc p1, p2, p3, p4, p5;
|
||||
|
||||
@BeforeClass
|
||||
public static void init() {
|
||||
public static void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
seq = new IndexedFastaSequenceFile(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
GenomeLocParser.setupRefContigOrdering(seq);
|
||||
|
||||
System.out.printf("Filled hanger is %n%s%n", makeFilledHanger());
|
||||
|
|
|
|||
|
|
@ -1,197 +0,0 @@
|
|||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.junit.*;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mhanna
|
||||
* Date: Apr 11, 2009
|
||||
* Time: 2:32:52 PM
|
||||
*/
|
||||
public class FastaSequenceFile2Test extends BaseTest {
|
||||
|
||||
private static String sequenceFileName;
|
||||
private FastaSequenceFile2 sequenceFile = null;
|
||||
|
||||
private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
|
||||
private final String firstBasesOfChr1 = "taaccctaaccctaacccta";
|
||||
private final String firstBasesOfChr8 = "GCAATTATGACACAAAAAAT";
|
||||
|
||||
@BeforeClass
|
||||
public static void initialize() {
|
||||
sequenceFileName = seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta";
|
||||
}
|
||||
|
||||
@Before
|
||||
public void doForEachTest() {
|
||||
sequenceFile = new FastaSequenceFile2( new File(sequenceFileName) );
|
||||
}
|
||||
|
||||
/**
|
||||
* Tears down the test fixture after each call.
|
||||
* <p/>
|
||||
* Called after every test case method.
|
||||
*/
|
||||
@After
|
||||
public void undoForEachTest() {
|
||||
sequenceFile = null;
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOpenFile() {
|
||||
logger.warn("Executing testOpenFile");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
Assert.assertNotNull( sequenceFile );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testOpenFile runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFirstSequence() {
|
||||
logger.warn("Executing testFirstSequence");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.nextSequence();
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chrM");
|
||||
Assert.assertEquals( "First n bases of chrM are incorrect",
|
||||
StringUtil.bytesToString( sequence.getBases(), 0, firstBasesOfChrM.length() ),
|
||||
firstBasesOfChrM );
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testFirstSequence runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNextSequence() {
|
||||
logger.warn("Executing testNextSequence");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
ReferenceSequence sequence = null;
|
||||
|
||||
// Advance to chrM.
|
||||
sequence = sequenceFile.nextSequence();
|
||||
sequence = sequenceFile.nextSequence();
|
||||
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr1");
|
||||
|
||||
// Workaround: bytesToString for chr1 of the fasta file we've picked doesn't appear to work.
|
||||
// TODO: Report this as sam-jdk bug.
|
||||
byte[] firstOfChr1 = StringUtil.stringToBytes(firstBasesOfChr1);
|
||||
byte[] firstOfSequence = new byte[firstBasesOfChr1.length()];
|
||||
System.arraycopy(sequence.getBases(), 0, firstOfSequence, 0, firstOfSequence.length );
|
||||
|
||||
Assert.assertArrayEquals("First bases of chr1 are not correct", firstOfChr1, firstOfSequence );
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testNextSequence runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeekToSequence() {
|
||||
logger.warn("Executing testSeekToSequence");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
boolean success = sequenceFile.seekToContig("chr8");
|
||||
Assert.assertTrue("Seek to seq chr8 failed", success );
|
||||
|
||||
ReferenceSequence sequence = sequenceFile.nextSequence();
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr8");
|
||||
Assert.assertEquals( "First n bases of chrc are incorrect",
|
||||
StringUtil.bytesToString( sequence.getBases(), 0, firstBasesOfChr8.length() ),
|
||||
firstBasesOfChr8 );
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testSeekToSequence runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
// TODO: Is NullPointerException *really* the right exception when a sequence is missing?
|
||||
@Test(expected=NullPointerException.class)
|
||||
public void testSeekToMissingSequence() {
|
||||
logger.warn("Executing testSeekToMissingSequence");
|
||||
|
||||
long startTime = 0L, endTime = 0L;
|
||||
|
||||
try {
|
||||
startTime = System.currentTimeMillis();
|
||||
boolean success = sequenceFile.seekToContig("absent");
|
||||
}
|
||||
finally {
|
||||
endTime = System.currentTimeMillis();
|
||||
System.err.printf("testSeekToMissingSequence runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeekBackward() {
|
||||
logger.warn("Executing testSeekBackward");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
boolean success = sequenceFile.seekToContig("chr9");
|
||||
Assert.assertTrue("Unable to seek to contig 'chr9'", success);
|
||||
|
||||
success = sequenceFile.seekToContig("chr8",true);
|
||||
Assert.assertTrue("Unable to seek backward to contig 'chr8'", success);
|
||||
|
||||
ReferenceSequence sequence = sequenceFile.nextSequence();
|
||||
Assert.assertEquals("First sequence contig is not correct", sequence.getName(), "chr8");
|
||||
Assert.assertEquals( "First n bases of chrc are incorrect",
|
||||
StringUtil.bytesToString( sequence.getBases(), 0, firstBasesOfChr8.length() ),
|
||||
firstBasesOfChr8 );
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testSeekBackward runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInvalidSeekBackward() {
|
||||
logger.warn("Executing testInvalidSeekBackward");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
boolean success = sequenceFile.seekToContig("chr9");
|
||||
Assert.assertTrue("Unable to seek to contig 'chr9'", success);
|
||||
|
||||
success = sequenceFile.seekToContig("chr8");
|
||||
Assert.assertFalse("Unable to seek backward to contig 'chr8'", success);
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testInvalidSeekBackward runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimultaneousAccess() {
|
||||
logger.warn("Executing testSimultaneousAccess");
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
||||
// FastaSequenceFile2 other = (FastaSequenceFile2)sequenceFile.clone();
|
||||
|
||||
sequenceFile.seekToContig("chr1");
|
||||
ReferenceSequence chr1 = sequenceFile.nextSequence();
|
||||
|
||||
// other.seekToContig("chr8");
|
||||
// ReferenceSequence chr8 = other.nextSequence();
|
||||
|
||||
// System.err.printf( "sequenceFile contig: %s%n", sequenceFile.getContigName() );
|
||||
// System.err.printf( "other contig: %s%n", other.getContigName() );
|
||||
|
||||
long endTime = System.currentTimeMillis();
|
||||
|
||||
System.err.printf("testSimultaneousAccess runtime: %dms%n", (endTime - startTime)) ;
|
||||
}
|
||||
}
|
||||
|
|
@ -10,6 +10,8 @@ import java.io.File;
|
|||
import java.io.FileNotFoundException;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequence;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.picard.PicardException;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
|
||||
|
|
@ -122,7 +124,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testFirstCompleteContigRead() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
|
@ -164,9 +166,10 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testMiddleCompleteContigRead() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
originalSequenceFile.seekToContig("chrY");
|
||||
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
while( !expectedSequence.getName().equals("chrY") )
|
||||
expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSequence("chrY");
|
||||
|
|
@ -183,9 +186,10 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testLastCompleteContigRead() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
originalSequenceFile.seekToContig("chrX_random");
|
||||
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
while( !expectedSequence.getName().equals("chrX_random") )
|
||||
expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
ReferenceSequence sequence = sequenceFile.getSequence("chrX_random");
|
||||
|
|
@ -233,7 +237,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testFirstElementOfIterator() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
||||
long startTime = System.currentTimeMillis();
|
||||
|
|
@ -251,7 +255,7 @@ public class IndexedFastaSequenceFileTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testNextElementOfIterator() {
|
||||
FastaSequenceFile2 originalSequenceFile = new FastaSequenceFile2(new File(sequenceFileName));
|
||||
ReferenceSequenceFile originalSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File(sequenceFileName));
|
||||
// Skip past the first one and load the second one.
|
||||
originalSequenceFile.nextSequence();
|
||||
ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
|
||||
|
|
|
|||
Loading…
Reference in New Issue